net.html: polish module, update docs (#7193)
parent
5fa1e403ec
commit
b952bf2e6b
|
@ -1,118 +1,16 @@
|
||||||
# V HTML
|
net.html is an HTML parser written in pure V.
|
||||||
|
|
||||||
An HTML parser made in V.
|
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
```v oksyntax
|
||||||
|
import net.html
|
||||||
|
|
||||||
If the description below isn't enough, please look at the test files.
|
fn main() {
|
||||||
|
doc := html.parse('<html><body><h1 class="title">Hello world!</h1></body></html>')
|
||||||
### Parser
|
tag := doc.get_tag('h1')[0] // <h1>Hello world!</h1>
|
||||||
|
println(tag.name) // h1
|
||||||
Responsible for reading HTML in full strings or split strings; returns all Tag objects of
|
println(tag.content) // Hello world!
|
||||||
that HTML, or a DocumentObjectModel that tries to work out the structure of the HTML tree.
|
println(tag.attributes) // {'class':'title'}
|
||||||
|
println(tag.str()) // <h1 class="title">Hello world!</h1>
|
||||||
#### split_parse(data string)
|
}
|
||||||
This function is the main function called by the parse method to parse your HTML in fragments.
|
```
|
||||||
|
More examples found on [`parser_test.v`](parser_test.v) and [`html_test.v`](html_test.v)
|
||||||
#### parse_html(data string, is_file bool)
|
|
||||||
This function is called passing a filename or a complete html data string to it.
|
|
||||||
|
|
||||||
#### add_code_tag(name string)
|
|
||||||
This function is used to add a tag whose content the parser should ignore.
|
|
||||||
For example, if you have an html or XML with a custom tag, like `<script>`, using this function,
|
|
||||||
like `add_code_tag('script')` will make all `script` tags content be jumped,
|
|
||||||
so you still have its content, but it will not confuse the parser with its `>` or `<`.
|
|
||||||
|
|
||||||
#### finalize()
|
|
||||||
When using the **split_parse** method, you must call this function to end the parse completely.
|
|
||||||
|
|
||||||
#### get_tags() []Tag_ptr
|
|
||||||
This function returns an array with all tags and their content.
|
|
||||||
|
|
||||||
#### get_dom() DocumentObjectModel
|
|
||||||
Returns the DocumentObjectModel for current parsed tags.
|
|
||||||
|
|
||||||
### WARNING
|
|
||||||
If you want to reuse parser object to parse another HTML, call `initialize_all()` function first.
|
|
||||||
|
|
||||||
### DocumentObjectModel
|
|
||||||
|
|
||||||
A DOM object that makes it easier to access and search for tags.
|
|
||||||
|
|
||||||
#### get_by_attribute_value(name string, value string) []Tag_ptr
|
|
||||||
This function returns a Tag array with all tags in the document
|
|
||||||
that have an attribute with the given name and the given value.
|
|
||||||
|
|
||||||
#### get_by_tag(name string) []Tag_ptr
|
|
||||||
This function returns a Tag array with all tags in the document that have the given name.
|
|
||||||
|
|
||||||
#### get_by_attribute(name string) []Tag_ptr
|
|
||||||
This function returns a Tag array with all tags in the document that have an attribute with the given name.
|
|
||||||
|
|
||||||
#### get_root() Tag_ptr
|
|
||||||
This function returns the root Tag.
|
|
||||||
|
|
||||||
#### get_all_tags() []Tag_ptr
|
|
||||||
This function returns all important tags, removing close tags.
|
|
||||||
|
|
||||||
### Tag
|
|
||||||
|
|
||||||
An object that holds tags information, such as `name`, `attributes`, `children`.
|
|
||||||
|
|
||||||
#### get_children() []Tag_ptr
|
|
||||||
Returns all children as an array.
|
|
||||||
|
|
||||||
#### get_parent() &Tag
|
|
||||||
Returns the parent of current tag.
|
|
||||||
|
|
||||||
#### get_name() string
|
|
||||||
Returns tag name.
|
|
||||||
|
|
||||||
#### get_content() string
|
|
||||||
Returns tag content.
|
|
||||||
|
|
||||||
#### get_attributes() map[string]string
|
|
||||||
Returns all attributes and their values.
|
|
||||||
|
|
||||||
#### text() string
|
|
||||||
Returns the content of the tag and all tags inside it.
|
|
||||||
Also, any `<br>` tag will be converted into `\n`.
|
|
||||||
|
|
||||||
## Some questions that can appear
|
|
||||||
|
|
||||||
### Q: Why does the parser have a `builder_str() string` method that returns only the lexeme string?
|
|
||||||
|
|
||||||
A: Because in the early stages of the project, `strings.Builder` was used,
|
|
||||||
but because of a bug somewhere, it was necessary to use `string` directly.
|
|
||||||
Later, it's planned to use `strings.Builder` again.
|
|
||||||
|
|
||||||
### Q: Why is there a `compare_string(a string, b string) bool` method?
|
|
||||||
|
|
||||||
A: For some reason, using != and == on strings directly does not work.
|
|
||||||
So this method is a workaround.
|
|
||||||
|
|
||||||
### Q: Will there be something like `XPath`?
|
|
||||||
|
|
||||||
A: Like XPath yes. Exactly equal to it, no.
|
|
||||||
|
|
||||||
## Roadmap
|
|
||||||
- [x] Parser
|
|
||||||
- [x] `<!-- Comments -->` detection
|
|
||||||
- [x] `Open Generic tags` detection
|
|
||||||
- [x] `Close Generic tags` detection
|
|
||||||
- [x] `verify string` detection
|
|
||||||
- [x] `tag attributes` detection
|
|
||||||
- [x] `attributes values` detection
|
|
||||||
- [x] `tag text` (on tag it is declared as content, maybe change for text in the future)
|
|
||||||
- [x] `text file for parse` support (open local files for parsing)
|
|
||||||
- [x] `open_code` verification
|
|
||||||
- [x] DocumentObjectModel
|
|
||||||
- [x] push elements that have a close tag into stack
|
|
||||||
- [x] remove elements from stack
|
|
||||||
- [x] ~~create a new document root if have some syntax error (deleted)~~
|
|
||||||
- [x] search tags in `DOM` by attributes
|
|
||||||
- [x] search tags in `DOM` by tag type
|
|
||||||
- [x] finish dom test
|
|
||||||
|
|
||||||
## License
|
|
||||||
[MIT](../../../LICENSE)
|
|
||||||
|
|
|
@ -1,30 +1,35 @@
|
||||||
module html
|
module html
|
||||||
|
|
||||||
#include <limits.h>
|
const (
|
||||||
|
null_element = int(0x80000000)
|
||||||
|
)
|
||||||
|
|
||||||
struct Stack {
|
struct Stack {
|
||||||
null_element int = C.INT_MIN
|
|
||||||
mut:
|
mut:
|
||||||
elements []int
|
elements []int
|
||||||
size int
|
size int
|
||||||
}
|
}
|
||||||
|
|
||||||
fn (stack Stack) is_null(data int) bool {
|
[inline]
|
||||||
return data == stack.null_element
|
fn is_null(data int) bool {
|
||||||
|
return data == null_element
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[inline]
|
||||||
fn (stack Stack) is_empty() bool {
|
fn (stack Stack) is_empty() bool {
|
||||||
return stack.size <= 0
|
return stack.size <= 0
|
||||||
}
|
}
|
||||||
|
|
||||||
fn (stack Stack) peek() int {
|
fn (stack Stack) peek() int {
|
||||||
if !stack.is_empty() {
|
return if !stack.is_empty() {
|
||||||
return stack.elements[stack.size - 1]
|
stack.elements[stack.size - 1]
|
||||||
|
} else {
|
||||||
|
null_element
|
||||||
}
|
}
|
||||||
return stack.null_element
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn (mut stack Stack) pop() int {
|
fn (mut stack Stack) pop() int {
|
||||||
mut to_return := stack.null_element
|
mut to_return := null_element
|
||||||
if !stack.is_empty() {
|
if !stack.is_empty() {
|
||||||
to_return = stack.elements[stack.size - 1]
|
to_return = stack.elements[stack.size - 1]
|
||||||
stack.size--
|
stack.size--
|
||||||
|
@ -53,7 +58,6 @@ fn (mut btree BTree) add_children(tag Tag) int {
|
||||||
btree.all_tags << tag
|
btree.all_tags << tag
|
||||||
if btree.all_tags.len > 1 {
|
if btree.all_tags.len > 1 {
|
||||||
for btree.childrens.len <= btree.node_pointer {
|
for btree.childrens.len <= btree.node_pointer {
|
||||||
// println("${btree.childrens.len} <= ${btree.node_pointer}")
|
|
||||||
mut temp_array := btree.childrens
|
mut temp_array := btree.childrens
|
||||||
temp_array << []int{}
|
temp_array << []int{}
|
||||||
btree.childrens = temp_array
|
btree.childrens = temp_array
|
||||||
|
@ -69,14 +73,17 @@ fn (mut btree BTree) add_children(tag Tag) int {
|
||||||
return btree.all_tags.len - 1
|
return btree.all_tags.len - 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[inline]
|
||||||
fn (btree BTree) get_children() []int {
|
fn (btree BTree) get_children() []int {
|
||||||
return btree.childrens[btree.node_pointer]
|
return btree.childrens[btree.node_pointer]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[inline]
|
||||||
fn (btree BTree) get_parent() int {
|
fn (btree BTree) get_parent() int {
|
||||||
return btree.parents[btree.node_pointer]
|
return btree.parents[btree.node_pointer]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[inline]
|
||||||
fn (btree BTree) get_stored() Tag {
|
fn (btree BTree) get_stored() Tag {
|
||||||
return btree.all_tags[btree.node_pointer]
|
return btree.all_tags[btree.node_pointer]
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,6 +2,11 @@ module html
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
// The W3C Document Object Model (DOM) is a platform and language-neutral
|
||||||
|
// interface that allows programs and scripts to dynamically access and
|
||||||
|
// update the content, structure, and style of a document.
|
||||||
|
//
|
||||||
|
// https://www.w3.org/TR/WD-DOM/introduction.html
|
||||||
pub struct DocumentObjectModel {
|
pub struct DocumentObjectModel {
|
||||||
mut:
|
mut:
|
||||||
root &Tag
|
root &Tag
|
||||||
|
@ -25,24 +30,14 @@ fn (mut dom DocumentObjectModel) print_debug(data string) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
[inline]
|
||||||
fn (dom mut DocumentObjectModel) new_root(tag &Tag) {
|
|
||||||
mut new_tag := &Tag{} new_tag.name = "div"
|
|
||||||
new_tag.add_child(dom.root) new_tag.add_child(tag)
|
|
||||||
dom.root = new_tag
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
fn is_close_tag(tag &Tag) bool {
|
fn is_close_tag(tag &Tag) bool {
|
||||||
if tag.name.len > 0 {
|
return tag.name.len > 0 && tag.name[0] == `/`
|
||||||
return tag.name[0] == 47 // return if equals to /
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn (mut dom DocumentObjectModel) where_is(item_name string, attribute_name string) int {
|
fn (mut dom DocumentObjectModel) where_is(item_name string, attribute_name string) int {
|
||||||
if !(attribute_name in dom.attributes) {
|
if attribute_name !in dom.attributes {
|
||||||
temp_array := []string{}
|
dom.attributes[attribute_name] = []string{}
|
||||||
dom.attributes[attribute_name] = temp_array
|
|
||||||
}
|
}
|
||||||
mut string_array := dom.attributes[attribute_name]
|
mut string_array := dom.attributes[attribute_name]
|
||||||
mut counter := 0
|
mut counter := 0
|
||||||
|
@ -58,10 +53,10 @@ fn (mut dom DocumentObjectModel) where_is(item_name string, attribute_name strin
|
||||||
}
|
}
|
||||||
|
|
||||||
fn (mut dom DocumentObjectModel) add_tag_attribute(tag &Tag) {
|
fn (mut dom DocumentObjectModel) add_tag_attribute(tag &Tag) {
|
||||||
for attribute_name in tag.attributes.keys() {
|
for attribute_name, _ in tag.attributes {
|
||||||
attribute_value := tag.attributes[attribute_name]
|
attribute_value := tag.attributes[attribute_name]
|
||||||
location := dom.where_is(attribute_value, attribute_name)
|
location := dom.where_is(attribute_value, attribute_name)
|
||||||
if !(attribute_name in dom.tag_attributes) {
|
if attribute_name !in dom.tag_attributes {
|
||||||
dom.tag_attributes[attribute_name] = []
|
dom.tag_attributes[attribute_name] = []
|
||||||
}
|
}
|
||||||
for {
|
for {
|
||||||
|
@ -91,7 +86,7 @@ fn (mut dom DocumentObjectModel) add_tag_by_type(tag &Tag) {
|
||||||
|
|
||||||
fn (mut dom DocumentObjectModel) add_tag_by_attribute(tag &Tag) {
|
fn (mut dom DocumentObjectModel) add_tag_by_attribute(tag &Tag) {
|
||||||
for attribute_name in tag.attributes.keys() {
|
for attribute_name in tag.attributes.keys() {
|
||||||
if !(attribute_name in dom.all_attributes) {
|
if attribute_name !in dom.all_attributes {
|
||||||
dom.all_attributes[attribute_name] = [tag]
|
dom.all_attributes[attribute_name] = [tag]
|
||||||
} else {
|
} else {
|
||||||
mut temp_array := dom.all_attributes[attribute_name]
|
mut temp_array := dom.all_attributes[attribute_name]
|
||||||
|
@ -101,22 +96,10 @@ fn (mut dom DocumentObjectModel) add_tag_by_attribute(tag &Tag) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn compare_string(a string, b string) bool { // for some reason == doesn't work
|
|
||||||
if a.len != b.len {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
for i := 0; i < a.len; i++ {
|
|
||||||
if a[i] != b[i] {
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
fn (mut dom DocumentObjectModel) construct(tag_list []&Tag) {
|
fn (mut dom DocumentObjectModel) construct(tag_list []&Tag) {
|
||||||
dom.constructed = true
|
dom.constructed = true
|
||||||
mut temp_map := map[string]int{}
|
mut temp_map := map[string]int{}
|
||||||
mut temp_int := C.INT_MIN
|
mut temp_int := null_element
|
||||||
mut temp_string := ''
|
mut temp_string := ''
|
||||||
mut stack := Stack{}
|
mut stack := Stack{}
|
||||||
dom.btree = BTree{}
|
dom.btree = BTree{}
|
||||||
|
@ -130,21 +113,16 @@ fn (mut dom DocumentObjectModel) construct(tag_list []&Tag) {
|
||||||
dom.print_debug(tag.str())
|
dom.print_debug(tag.str())
|
||||||
if is_close_tag(tag) {
|
if is_close_tag(tag) {
|
||||||
temp_int = stack.peek()
|
temp_int = stack.peek()
|
||||||
temp_string = tag.name[1..tag.name.len] // print(temp_string + " != " + tag_list[temp_int].name + " >> ") // println(temp_string != tag_list[temp_int].name)
|
temp_string = tag.name[1..]
|
||||||
for !stack.is_null(temp_int) &&
|
for !is_null(temp_int) && temp_string != tag_list[temp_int].name && !tag_list[temp_int].closed {
|
||||||
!compare_string(temp_string, tag_list[temp_int].name) && !tag_list[temp_int].closed {
|
dom.print_debug(temp_string + ' >> ' + tag_list[temp_int].name + ' ' + (temp_string ==
|
||||||
dom.print_debug(temp_string + ' >> ' + tag_list[temp_int].name + ' ' +
|
tag_list[temp_int].name).str())
|
||||||
compare_string(temp_string, tag_list[temp_int].name).str())
|
|
||||||
stack.pop()
|
stack.pop()
|
||||||
temp_int = stack.peek()
|
temp_int = stack.peek()
|
||||||
}
|
}
|
||||||
temp_int = stack.peek()
|
temp_int = stack.peek()
|
||||||
if !stack.is_null(temp_int) {
|
temp_int = if !is_null(temp_int) { stack.pop() } else { root_index }
|
||||||
temp_int = stack.pop()
|
if is_null(temp_int) {
|
||||||
} else {
|
|
||||||
temp_int = root_index
|
|
||||||
}
|
|
||||||
if stack.is_null(temp_int) {
|
|
||||||
stack.push(root_index)
|
stack.push(root_index)
|
||||||
}
|
}
|
||||||
dom.print_debug('Removed ' + temp_string + ' -- ' + tag_list[temp_int].name)
|
dom.print_debug('Removed ' + temp_string + ' -- ' + tag_list[temp_int].name)
|
||||||
|
@ -154,7 +132,7 @@ fn (mut dom DocumentObjectModel) construct(tag_list []&Tag) {
|
||||||
dom.add_tag_by_type(tag)
|
dom.add_tag_by_type(tag)
|
||||||
dom.all_tags << tag
|
dom.all_tags << tag
|
||||||
temp_int = stack.peek()
|
temp_int = stack.peek()
|
||||||
if !stack.is_null(temp_int) {
|
if !is_null(temp_int) {
|
||||||
dom.btree.move_pointer(temp_map[temp_int.str()])
|
dom.btree.move_pointer(temp_map[temp_int.str()])
|
||||||
temp_map[index.str()] = dom.btree.add_children(tag)
|
temp_map[index.str()] = dom.btree.add_children(tag)
|
||||||
mut temp_tag := tag_list[temp_int]
|
mut temp_tag := tag_list[temp_int]
|
||||||
|
@ -164,8 +142,7 @@ fn (mut dom DocumentObjectModel) construct(tag_list []&Tag) {
|
||||||
dom.print_debug("Added ${tag.name} as child of '" + tag_list[temp_int].name +
|
dom.print_debug("Added ${tag.name} as child of '" + tag_list[temp_int].name +
|
||||||
"' which now has ${dom.btree.get_children().len} childrens")
|
"' which now has ${dom.btree.get_children().len} childrens")
|
||||||
*/
|
*/
|
||||||
dom.print_debug("Added $tag.name as child of '" + temp_tag.name +
|
dom.print_debug("Added $tag.name as child of '" + temp_tag.name + "' which now has $temp_tag.children.len childrens")
|
||||||
"' which now has $temp_tag.get_children().len childrens")
|
|
||||||
} else { // dom.new_root(tag)
|
} else { // dom.new_root(tag)
|
||||||
stack.push(root_index)
|
stack.push(root_index)
|
||||||
}
|
}
|
||||||
|
@ -179,40 +156,40 @@ fn (mut dom DocumentObjectModel) construct(tag_list []&Tag) {
|
||||||
dom.root = tag_list[0]
|
dom.root = tag_list[0]
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn (mut dom DocumentObjectModel) get_by_attribute_value(name string, value string) []&Tag {
|
// get_tag_by_attribute_value retrieves all the tags in the document that have the given attribute name and value.
|
||||||
|
pub fn (mut dom DocumentObjectModel) get_tag_by_attribute_value(name string, value string) []&Tag {
|
||||||
location := dom.where_is(value, name)
|
location := dom.where_is(value, name)
|
||||||
if dom.tag_attributes[name].len > location {
|
return if dom.tag_attributes[name].len > location {
|
||||||
return dom.tag_attributes[name][location]
|
dom.tag_attributes[name][location]
|
||||||
|
} else {
|
||||||
|
[]&Tag{}
|
||||||
}
|
}
|
||||||
return []&Tag{}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn (dom DocumentObjectModel) get_by_tag(name string) []&Tag {
|
// get_tag retrieves all the tags in the document that have the given tag name.
|
||||||
if name in dom.tag_type {
|
pub fn (dom DocumentObjectModel) get_tag(name string) []&Tag {
|
||||||
return dom.tag_type[name]
|
return if name in dom.tag_type {
|
||||||
|
dom.tag_type[name]
|
||||||
|
} else {
|
||||||
|
[]&Tag{}
|
||||||
}
|
}
|
||||||
return []&Tag{}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn (dom DocumentObjectModel) get_by_attribute(name string) []&Tag {
|
// get_tag_by_attribute retrieves all the tags in the document that have the given attribute name.
|
||||||
if name in dom.all_attributes {
|
pub fn (dom DocumentObjectModel) get_tag_by_attribute(name string) []&Tag {
|
||||||
return dom.all_attributes[name]
|
return if name in dom.all_attributes {
|
||||||
|
dom.all_attributes[name]
|
||||||
|
} else {
|
||||||
|
[]&Tag{}
|
||||||
}
|
}
|
||||||
return []&Tag{}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// get_root returns the root of the document.
|
||||||
pub fn (dom DocumentObjectModel) get_root() &Tag {
|
pub fn (dom DocumentObjectModel) get_root() &Tag {
|
||||||
return dom.root
|
return dom.root
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn (dom DocumentObjectModel) get_all_tags() []&Tag {
|
// get_tags returns all of the tags stored in the document.
|
||||||
|
pub fn (dom DocumentObjectModel) get_tags() []&Tag {
|
||||||
return dom.all_tags
|
return dom.all_tags
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
pub fn (dom DocumentObjectModel) get_xpath() XPath {
|
|
||||||
return XPath{
|
|
||||||
dom: dom
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
|
@ -1,63 +1,56 @@
|
||||||
module html
|
module html
|
||||||
|
|
||||||
fn generate_temp_html() string {
|
import strings
|
||||||
mut temp_html := '<!doctype html><html><head><title>Giant String</title></head><body>'
|
|
||||||
for counter := 0; counter < 4; counter++ {
|
|
||||||
temp_html += "<div id='name_$counter' "
|
|
||||||
temp_html += "class='several-$counter'>Look at $counter</div>"
|
|
||||||
}
|
|
||||||
temp_html += '</body></html>'
|
|
||||||
return temp_html
|
|
||||||
}
|
|
||||||
|
|
||||||
fn generate_dom(temp_html string) DocumentObjectModel {
|
fn generate_temp_html() string {
|
||||||
mut parser := Parser{}
|
mut temp_html := strings.new_builder(200)
|
||||||
parser.parse_html(temp_html, false)
|
temp_html.write('<!doctype html><html><head><title>Giant String</title></head><body>')
|
||||||
dom := parser.get_dom()
|
for counter := 0; counter < 4; counter++ {
|
||||||
return dom
|
temp_html.write("<div id='name_$counter' ")
|
||||||
|
temp_html.write("class='several-$counter'>Look at $counter</div>")
|
||||||
|
}
|
||||||
|
temp_html.write('</body></html>')
|
||||||
|
return temp_html.str()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_search_by_tag_type() {
|
fn test_search_by_tag_type() {
|
||||||
dom := generate_dom(generate_temp_html())
|
dom := parse(generate_temp_html())
|
||||||
assert dom.get_by_tag('div').len == 4
|
assert dom.get_tag('div').len == 4
|
||||||
assert dom.get_by_tag('head').len == 1
|
assert dom.get_tag('head').len == 1
|
||||||
assert dom.get_by_tag('body').len == 1
|
assert dom.get_tag('body').len == 1
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_search_by_attribute_value() {
|
fn test_search_by_attribute_value() {
|
||||||
mut dom := generate_dom(generate_temp_html())
|
mut dom := parse(generate_temp_html())
|
||||||
// println(temp_html)
|
// println(temp_html)
|
||||||
print('Amount ')
|
print('Amount ')
|
||||||
println(dom.get_by_attribute_value('id', 'name_0'))
|
println(dom.get_tag_by_attribute_value('id', 'name_0'))
|
||||||
assert dom.get_by_attribute_value('id', 'name_0').len == 1
|
assert dom.get_tag_by_attribute_value('id', 'name_0').len == 1
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_access_parent() {
|
fn test_access_parent() {
|
||||||
mut dom := generate_dom(generate_temp_html())
|
mut dom := parse(generate_temp_html())
|
||||||
div_tags := dom.get_by_tag('div')
|
div_tags := dom.get_tag('div')
|
||||||
assert div_tags[0].get_parent() != C.NULL
|
parent := div_tags[0].parent
|
||||||
/*
|
assert parent != 0
|
||||||
parent := div_tags[0].get_parent()
|
|
||||||
assert parent != C.NULL
|
|
||||||
for div_tag in div_tags {
|
for div_tag in div_tags {
|
||||||
assert div_tag.get_parent() == parent
|
assert div_tag.parent == parent
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_search_by_attributes() {
|
fn test_search_by_attributes() {
|
||||||
dom := generate_dom(generate_temp_html())
|
dom := parse(generate_temp_html())
|
||||||
assert dom.get_by_attribute('id').len == 4
|
assert dom.get_tag_by_attribute('id').len == 4
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_tags_used() {
|
fn test_tags_used() {
|
||||||
dom := generate_dom(generate_temp_html())
|
dom := parse(generate_temp_html())
|
||||||
assert dom.get_all_tags().len == 9
|
assert dom.get_tags().len == 9
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_access_tag_fields() {
|
fn test_access_tag_fields() {
|
||||||
dom := generate_dom(generate_temp_html())
|
dom := parse(generate_temp_html())
|
||||||
id_tags := dom.get_by_attribute('id')
|
id_tags := dom.get_tag_by_attribute('id')
|
||||||
assert id_tags[0].get_name() == "div"
|
assert id_tags[0].name == 'div'
|
||||||
assert id_tags[1].get_attributes()['class'] == "several-1"
|
assert id_tags[1].attributes['class'] == 'several-1'
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,18 @@
|
||||||
|
module html
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
// parse parses and returns the DOM from the given text.
|
||||||
|
pub fn parse(text string) DocumentObjectModel {
|
||||||
|
mut parser := Parser{}
|
||||||
|
parser.parse_html(text)
|
||||||
|
return parser.get_dom()
|
||||||
|
}
|
||||||
|
|
||||||
|
// parse_file parses and returns the DOM from the contents of a file.
|
||||||
|
pub fn parse_file(filename string) DocumentObjectModel {
|
||||||
|
content := os.read_file(filename) or { return DocumentObjectModel{
|
||||||
|
root: &Tag{}
|
||||||
|
} }
|
||||||
|
return parse(content)
|
||||||
|
}
|
|
@ -0,0 +1,15 @@
|
||||||
|
module html
|
||||||
|
|
||||||
|
fn test_parse() {
|
||||||
|
doc := parse('<html><body><h1 class="title">Hello world!</h1></body></html>')
|
||||||
|
tags := doc.get_tag('h1')
|
||||||
|
assert tags.len == 1
|
||||||
|
h1_tag := tags[0] // <h1>Hello world!</h1>
|
||||||
|
assert h1_tag.name == 'h1'
|
||||||
|
assert h1_tag.content == 'Hello world!'
|
||||||
|
assert h1_tag.attributes.len == 2
|
||||||
|
// TODO: do not remove. Attributes must not have an empty attr.
|
||||||
|
// assert h1_tag.attributes.len == 1
|
||||||
|
assert h1_tag.str() == '<h1 class="title" >Hello world!</h1>'
|
||||||
|
// assert h1_tag.str() == '<h1 class="title">Hello world!</h1>'
|
||||||
|
}
|
|
@ -1,8 +1,9 @@
|
||||||
module html
|
module html
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import strings
|
||||||
|
|
||||||
struct LexycalAttributes {
|
struct LexicalAttributes {
|
||||||
mut:
|
mut:
|
||||||
current_tag &Tag
|
current_tag &Tag
|
||||||
open_tag bool
|
open_tag bool
|
||||||
|
@ -12,44 +13,40 @@ mut:
|
||||||
is_attribute bool
|
is_attribute bool
|
||||||
opened_code_type string
|
opened_code_type string
|
||||||
line_count int
|
line_count int
|
||||||
lexeme_builder string
|
lexeme_builder strings.Builder = strings.Builder{}
|
||||||
code_tags map[string]bool = {
|
code_tags map[string]bool = {
|
||||||
'script': true
|
'script': true
|
||||||
'style': true
|
'style': true
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
fn (mut lxa LexycalAttributes) write_lexeme(data byte) {
|
|
||||||
mut temp := lxa.lexeme_builder
|
|
||||||
temp += data.str()
|
|
||||||
lxa.lexeme_builder = temp
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Parser is responsible for reading the HTML strings and converting them into a `DocumentObjectModel`.
|
||||||
pub struct Parser {
|
pub struct Parser {
|
||||||
mut:
|
mut:
|
||||||
dom DocumentObjectModel
|
dom DocumentObjectModel
|
||||||
lexycal_attributes LexycalAttributes = LexycalAttributes{
|
lexical_attributes LexicalAttributes = LexicalAttributes{
|
||||||
current_tag: &Tag{}
|
current_tag: &Tag{}
|
||||||
}
|
}
|
||||||
filename string = 'direct-parse'
|
filename string = 'direct-parse'
|
||||||
initialized bool
|
initialized bool
|
||||||
tags []&Tag
|
tags []&Tag
|
||||||
debug_file os.File
|
debug_file os.File
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This function is used to add a tag whose content the parser should ignore.
|
||||||
|
// For example, if you have an html or XML with a custom tag, like `<script>`, using this function,
|
||||||
|
// like `add_code_tag('script')` will make all `script` tags content be jumped,
|
||||||
|
// so you still have its content, but it will not confuse the parser with its `>` or `<`.
|
||||||
pub fn (mut parser Parser) add_code_tag(name string) {
|
pub fn (mut parser Parser) add_code_tag(name string) {
|
||||||
if parser.lexycal_attributes.code_tags.keys().len <= 0 {
|
if name.len <= 0 {
|
||||||
parser.lexycal_attributes.code_tags = map[string]bool{}
|
return
|
||||||
parser.lexycal_attributes.code_tags['script'] = true
|
|
||||||
parser.lexycal_attributes.code_tags['style'] = true
|
|
||||||
}
|
|
||||||
if name.len > 0 {
|
|
||||||
parser.lexycal_attributes.code_tags[name] = true
|
|
||||||
}
|
}
|
||||||
|
parser.lexical_attributes.code_tags[name] = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
[inline]
|
||||||
fn (parser Parser) builder_str() string {
|
fn (parser Parser) builder_str() string {
|
||||||
return parser.lexycal_attributes.lexeme_builder
|
return parser.lexical_attributes.lexeme_builder.after(0)
|
||||||
}
|
}
|
||||||
|
|
||||||
[if debug]
|
[if debug]
|
||||||
|
@ -65,28 +62,28 @@ fn (mut parser Parser) verify_end_comment(remove bool) bool {
|
||||||
lexeme := parser.builder_str()
|
lexeme := parser.builder_str()
|
||||||
last := lexeme[lexeme.len - 1]
|
last := lexeme[lexeme.len - 1]
|
||||||
penultimate := lexeme[lexeme.len - 2]
|
penultimate := lexeme[lexeme.len - 2]
|
||||||
mut is_end_comment := false
|
is_end_comment := last == `-` && penultimate == `-`
|
||||||
if last.str() == '-' && penultimate.str() == '-' {
|
|
||||||
is_end_comment = true
|
|
||||||
}
|
|
||||||
if is_end_comment && remove {
|
if is_end_comment && remove {
|
||||||
temp := parser.lexycal_attributes.lexeme_builder
|
parser.lexical_attributes.lexeme_builder.go_back(2)
|
||||||
parser.lexycal_attributes.lexeme_builder = temp[0..temp.len - 2]
|
|
||||||
}
|
}
|
||||||
return is_end_comment
|
return is_end_comment
|
||||||
}
|
}
|
||||||
|
|
||||||
fn blank_string(data string) bool {
|
fn blank_string(data string) bool {
|
||||||
mut count := 0
|
mut count := 0
|
||||||
for word in data {
|
for chr in data {
|
||||||
if word == 9 || word == 32 {
|
if chr == 9 || chr == 32 {
|
||||||
count++
|
count++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return count == data.len
|
return count == data.len
|
||||||
}
|
}
|
||||||
|
|
||||||
fn (mut parser Parser) initialize_all() {
|
// init initializes the parser.
|
||||||
|
fn (mut parser Parser) init() {
|
||||||
|
if parser.initialized {
|
||||||
|
return
|
||||||
|
}
|
||||||
parser.dom = DocumentObjectModel{
|
parser.dom = DocumentObjectModel{
|
||||||
debug_file: parser.debug_file
|
debug_file: parser.debug_file
|
||||||
root: &Tag{}
|
root: &Tag{}
|
||||||
|
@ -94,181 +91,165 @@ fn (mut parser Parser) initialize_all() {
|
||||||
parser.add_code_tag('')
|
parser.add_code_tag('')
|
||||||
parser.tags = []&Tag{}
|
parser.tags = []&Tag{}
|
||||||
parser.dom.close_tags['/!document'] = true
|
parser.dom.close_tags['/!document'] = true
|
||||||
parser.lexycal_attributes.current_tag = &Tag{}
|
parser.lexical_attributes.current_tag = &Tag{}
|
||||||
parser.initialized = true
|
parser.initialized = true
|
||||||
}
|
}
|
||||||
|
|
||||||
fn (mut parser Parser) generate_tag() {
|
fn (mut parser Parser) generate_tag() {
|
||||||
if !parser.lexycal_attributes.open_tag {
|
if parser.lexical_attributes.open_tag {
|
||||||
if parser.lexycal_attributes.current_tag.name.len > 0 ||
|
return
|
||||||
parser.lexycal_attributes.current_tag.content.len > 0 {
|
|
||||||
parser.tags << parser.lexycal_attributes.current_tag
|
|
||||||
}
|
}
|
||||||
parser.lexycal_attributes.current_tag = &Tag{}
|
if parser.lexical_attributes.current_tag.name.len > 0 ||
|
||||||
|
parser.lexical_attributes.current_tag.content.len > 0 {
|
||||||
|
parser.tags << parser.lexical_attributes.current_tag
|
||||||
}
|
}
|
||||||
|
parser.lexical_attributes.current_tag = &Tag{}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// split_parse parses the HTML fragment
|
||||||
pub fn (mut parser Parser) split_parse(data string) {
|
pub fn (mut parser Parser) split_parse(data string) {
|
||||||
if !parser.initialized {
|
parser.init()
|
||||||
parser.initialize_all()
|
for chr in data {
|
||||||
}
|
// returns true if byte is a " or '
|
||||||
for word in data {
|
is_quote := chr == `"` || chr == `\'`
|
||||||
mut is_quotation := false // " or '
|
string_code := match chr {
|
||||||
if word == 34 || word == 39 {
|
`"` { 1 } // "
|
||||||
is_quotation = true
|
`\'` { 2 } // '
|
||||||
}
|
|
||||||
string_code := match word {
|
|
||||||
34 { 1 } // "
|
|
||||||
39 { 2 } // '
|
|
||||||
else { 0 }
|
else { 0 }
|
||||||
}
|
}
|
||||||
if parser.lexycal_attributes.open_code { // here will verify all needed to know if open_code finishes and string in code
|
if parser.lexical_attributes.open_code { // here will verify all needed to know if open_code finishes and string in code
|
||||||
parser.lexycal_attributes.write_lexeme(word)
|
parser.lexical_attributes.lexeme_builder.write_b(chr)
|
||||||
if parser.lexycal_attributes.open_string > 0 {
|
if parser.lexical_attributes.open_string > 0 &&
|
||||||
if parser.lexycal_attributes.open_string == string_code {
|
parser.lexical_attributes.open_string == string_code {
|
||||||
parser.lexycal_attributes.open_string = 0
|
parser.lexical_attributes.open_string = 0
|
||||||
}
|
} else if is_quote {
|
||||||
} else if is_quotation {
|
parser.lexical_attributes.open_string = string_code
|
||||||
parser.lexycal_attributes.open_string = string_code
|
} else if chr == `>` { // only execute verification if is a > // here will verify < to know if code tag is finished
|
||||||
} else if word == 62 { // only execute verification if is a > // here will verify < to know if code tag is finished
|
name_close_tag := '</$parser.lexical_attributes.opened_code_type>'
|
||||||
name_close_tag := '</' + parser.lexycal_attributes.opened_code_type + '>'
|
if parser.builder_str().to_lower().ends_with(name_close_tag) {
|
||||||
temp_string := parser.builder_str()
|
parser.lexical_attributes.open_code = false
|
||||||
if temp_string.to_lower().ends_with(name_close_tag) {
|
|
||||||
parser.lexycal_attributes.open_code = false
|
|
||||||
// need to modify lexeme_builder to add script text as a content in next loop (not gave error in dom)
|
// need to modify lexeme_builder to add script text as a content in next loop (not gave error in dom)
|
||||||
parser.lexycal_attributes.lexeme_builder = temp_string[0..temp_string.len -
|
parser.lexical_attributes.lexeme_builder.go_back(name_close_tag.len)
|
||||||
name_close_tag.len]
|
parser.lexical_attributes.current_tag.closed = true
|
||||||
parser.lexycal_attributes.current_tag.closed = true
|
parser.lexical_attributes.current_tag.close_type = .new_tag
|
||||||
parser.lexycal_attributes.current_tag.close_type = .new_tag
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else if parser.lexycal_attributes.open_comment {
|
} else if parser.lexical_attributes.open_comment {
|
||||||
if word == 62 && parser.verify_end_comment(false) { // close tag '>'
|
if chr == `>` && parser.verify_end_comment(false) { // close tag '>'
|
||||||
// parser.print_debug(parser.builder_str() + " >> " + parser.lexycal_attributes.line_count.str())
|
// parser.print_debug(parser.builder_str() + " >> " + parser.lexical_attributes.line_count.str())
|
||||||
parser.lexycal_attributes.lexeme_builder = '' // strings.Builder{}
|
parser.lexical_attributes.lexeme_builder.go_back_to(0)
|
||||||
parser.lexycal_attributes.open_comment = false
|
parser.lexical_attributes.open_comment = false
|
||||||
parser.lexycal_attributes.open_tag = false
|
parser.lexical_attributes.open_tag = false
|
||||||
} else {
|
} else {
|
||||||
parser.lexycal_attributes.write_lexeme(word)
|
parser.lexical_attributes.lexeme_builder.write_b(chr)
|
||||||
}
|
}
|
||||||
} else if parser.lexycal_attributes.open_string > 0 {
|
} else if parser.lexical_attributes.open_string > 0 {
|
||||||
if parser.lexycal_attributes.open_string == string_code {
|
if parser.lexical_attributes.open_string == string_code {
|
||||||
parser.lexycal_attributes.open_string = 0
|
parser.lexical_attributes.open_string = 0
|
||||||
parser.lexycal_attributes.write_lexeme(word)
|
parser.lexical_attributes.lexeme_builder.write_b(chr)
|
||||||
temp_lexeme := parser.builder_str()
|
temp_lexeme := parser.builder_str()
|
||||||
if parser.lexycal_attributes.current_tag.last_attribute != '' {
|
if parser.lexical_attributes.current_tag.last_attribute != '' {
|
||||||
lattr := parser.lexycal_attributes.current_tag.last_attribute
|
lattr := parser.lexical_attributes.current_tag.last_attribute
|
||||||
nval := temp_lexeme.substr(1, temp_lexeme.len - 1)
|
nval := temp_lexeme.substr(1, temp_lexeme.len - 1)
|
||||||
// parser.print_debug(lattr + " = " + temp_lexeme)
|
// parser.print_debug(lattr + " = " + temp_lexeme)
|
||||||
parser.lexycal_attributes.current_tag.attributes[lattr] = nval
|
parser.lexical_attributes.current_tag.attributes[lattr] = nval
|
||||||
parser.lexycal_attributes.current_tag.last_attribute = ''
|
parser.lexical_attributes.current_tag.last_attribute = ''
|
||||||
} else {
|
} else {
|
||||||
parser.lexycal_attributes.current_tag.attributes[temp_lexeme.to_lower()] = '' // parser.print_debug(temp_lexeme)
|
parser.lexical_attributes.current_tag.attributes[temp_lexeme.to_lower()] = '' // parser.print_debug(temp_lexeme)
|
||||||
}
|
}
|
||||||
parser.lexycal_attributes.lexeme_builder = ''
|
parser.lexical_attributes.lexeme_builder.go_back_to(0)
|
||||||
} else {
|
} else {
|
||||||
parser.lexycal_attributes.write_lexeme(word)
|
parser.lexical_attributes.lexeme_builder.write_b(chr)
|
||||||
}
|
}
|
||||||
} else if parser.lexycal_attributes.open_tag {
|
} else if parser.lexical_attributes.open_tag {
|
||||||
if parser.lexycal_attributes.lexeme_builder.len == 0 && is_quotation {
|
if parser.lexical_attributes.lexeme_builder.len == 0 && is_quote {
|
||||||
parser.lexycal_attributes.open_string = string_code
|
parser.lexical_attributes.open_string = string_code
|
||||||
parser.lexycal_attributes.write_lexeme(word)
|
parser.lexical_attributes.lexeme_builder.write_b(chr)
|
||||||
} else if word == 62 { // close tag >
|
} else if chr == `>` { // close tag >
|
||||||
complete_lexeme := parser.builder_str().to_lower()
|
complete_lexeme := parser.builder_str().to_lower()
|
||||||
parser.lexycal_attributes.current_tag.closed = (complete_lexeme.len > 0 &&
|
parser.lexical_attributes.current_tag.closed = (complete_lexeme.len > 0 &&
|
||||||
complete_lexeme[complete_lexeme.len - 1] == 47) // if equals to /
|
complete_lexeme[complete_lexeme.len - 1] == `/`) // if equals to /
|
||||||
if complete_lexeme.len > 0 && complete_lexeme[0] == 47 {
|
if complete_lexeme.len > 0 && complete_lexeme[0] == `/` {
|
||||||
parser.dom.close_tags[complete_lexeme] = true
|
parser.dom.close_tags[complete_lexeme] = true
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
else if complete_lexeme.len > 0 && complete_lexeme[complete_lexeme.len - 1] == 47 { // if end tag like "/>"
|
else if complete_lexeme.len > 0 && complete_lexeme[complete_lexeme.len - 1] == 47 { // if end tag like "/>"
|
||||||
parser.lexycal_attributes.current_tag.closed = true
|
parser.lexical_attributes.current_tag.closed = true
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
if parser.lexycal_attributes.current_tag.name == '' {
|
if parser.lexical_attributes.current_tag.name == '' {
|
||||||
parser.lexycal_attributes.current_tag.name = complete_lexeme
|
parser.lexical_attributes.current_tag.name = complete_lexeme
|
||||||
} else if complete_lexeme != '/' {
|
} else if complete_lexeme != '/' {
|
||||||
parser.lexycal_attributes.current_tag.attributes[complete_lexeme] = ''
|
parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
|
||||||
}
|
}
|
||||||
parser.lexycal_attributes.open_tag = false
|
parser.lexical_attributes.open_tag = false
|
||||||
parser.lexycal_attributes.lexeme_builder = '' // if tag name is code
|
parser.lexical_attributes.lexeme_builder.go_back_to(0) // if tag name is code
|
||||||
if parser.lexycal_attributes.current_tag.name in parser.lexycal_attributes.code_tags {
|
if parser.lexical_attributes.current_tag.name in parser.lexical_attributes.code_tags {
|
||||||
parser.lexycal_attributes.open_code = true
|
parser.lexical_attributes.open_code = true
|
||||||
parser.lexycal_attributes.opened_code_type = parser.lexycal_attributes.current_tag.name
|
parser.lexical_attributes.opened_code_type = parser.lexical_attributes.current_tag.name
|
||||||
}
|
}
|
||||||
// parser.print_debug(parser.lexycal_attributes.current_tag.name)
|
// parser.print_debug(parser.lexical_attributes.current_tag.name)
|
||||||
} else if word != 9 && word != 32 && word != 61 && word != 10 { // Tab, space, = and \n
|
} else if chr !in [byte(9), ` `, `=`, `\n`] { // Tab, space, = and \n
|
||||||
parser.lexycal_attributes.write_lexeme(word)
|
parser.lexical_attributes.lexeme_builder.write_b(chr)
|
||||||
} else if word != 10 {
|
} else if chr != 10 {
|
||||||
complete_lexeme := parser.builder_str().to_lower()
|
complete_lexeme := parser.builder_str().to_lower()
|
||||||
if parser.lexycal_attributes.current_tag.name == '' {
|
if parser.lexical_attributes.current_tag.name == '' {
|
||||||
parser.lexycal_attributes.current_tag.name = complete_lexeme
|
parser.lexical_attributes.current_tag.name = complete_lexeme
|
||||||
} else {
|
} else {
|
||||||
parser.lexycal_attributes.current_tag.attributes[complete_lexeme] = ''
|
parser.lexical_attributes.current_tag.attributes[complete_lexeme] = ''
|
||||||
parser.lexycal_attributes.current_tag.last_attribute = ''
|
parser.lexical_attributes.current_tag.last_attribute = ''
|
||||||
if word == 61 { // if was a =
|
if chr == `=` { // if was a =
|
||||||
parser.lexycal_attributes.current_tag.last_attribute = complete_lexeme
|
parser.lexical_attributes.current_tag.last_attribute = complete_lexeme
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
parser.lexycal_attributes.lexeme_builder = '' // strings.Builder{}
|
parser.lexical_attributes.lexeme_builder.go_back_to(0)
|
||||||
}
|
}
|
||||||
if parser.builder_str() == '!--' {
|
if parser.builder_str() == '!--' {
|
||||||
parser.lexycal_attributes.open_comment = true
|
parser.lexical_attributes.open_comment = true
|
||||||
}
|
}
|
||||||
} else if word == 60 { // open tag '<'
|
} else if chr == `<` { // open tag '<'
|
||||||
temp_string := parser.builder_str()
|
temp_string := parser.builder_str()
|
||||||
if parser.lexycal_attributes.lexeme_builder.len >= 1 {
|
if parser.lexical_attributes.lexeme_builder.len >= 1 {
|
||||||
if parser.lexycal_attributes.current_tag.name.len > 1 &&
|
if parser.lexical_attributes.current_tag.name.len > 1 &&
|
||||||
parser.lexycal_attributes.current_tag.name[0] == 47 && !blank_string(temp_string) {
|
parser.lexical_attributes.current_tag.name[0] == 47 && !blank_string(temp_string) {
|
||||||
parser.tags << &Tag{
|
parser.tags << &Tag{
|
||||||
name: 'text'
|
name: 'text'
|
||||||
content: temp_string
|
content: temp_string
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
parser.lexycal_attributes.current_tag.content = temp_string // verify later who has this content
|
parser.lexical_attributes.current_tag.content = temp_string // verify later who has this content
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// parser.print_debug(parser.lexycal_attributes.current_tag.str())
|
// parser.print_debug(parser.lexical_attributes.current_tag.str())
|
||||||
parser.lexycal_attributes.lexeme_builder = ''
|
parser.lexical_attributes.lexeme_builder.go_back_to(0)
|
||||||
parser.generate_tag()
|
parser.generate_tag()
|
||||||
parser.lexycal_attributes.open_tag = true
|
parser.lexical_attributes.open_tag = true
|
||||||
} else {
|
} else {
|
||||||
parser.lexycal_attributes.write_lexeme(word)
|
parser.lexical_attributes.lexeme_builder.write_b(chr)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn (mut parser Parser) parse_html(data string, is_file bool) {
|
// parse_html parses the given HTML string
|
||||||
if !parser.initialized {
|
pub fn (mut parser Parser) parse_html(data string) {
|
||||||
parser.initialize_all()
|
parser.init()
|
||||||
}
|
mut lines := data.split_into_lines()
|
||||||
mut lines := []string{}
|
|
||||||
if is_file {
|
|
||||||
file_lines := os.read_lines(data) or {
|
|
||||||
eprintln('failed to read the file $data')
|
|
||||||
return
|
|
||||||
}
|
|
||||||
lines = file_lines
|
|
||||||
} else {
|
|
||||||
lines = data.split_into_lines()
|
|
||||||
}
|
|
||||||
for line in lines {
|
for line in lines {
|
||||||
parser.lexycal_attributes.line_count++
|
parser.lexical_attributes.line_count++
|
||||||
parser.split_parse(line)
|
parser.split_parse(line)
|
||||||
}
|
}
|
||||||
parser.generate_tag()
|
parser.generate_tag()
|
||||||
parser.dom.debug_file = parser.debug_file
|
parser.dom.debug_file = parser.debug_file
|
||||||
parser.dom.construct(parser.tags) // println(parser.close_tags.keys())
|
parser.dom.construct(parser.tags)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// finalize finishes the parsing stage .
|
||||||
|
[inline]
|
||||||
pub fn (mut parser Parser) finalize() {
|
pub fn (mut parser Parser) finalize() {
|
||||||
parser.generate_tag()
|
parser.generate_tag()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn (parser Parser) get_tags() []&Tag {
|
// get_dom returns the parser's current DOM representation.
|
||||||
return parser.tags
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn (mut parser Parser) get_dom() DocumentObjectModel {
|
pub fn (mut parser Parser) get_dom() DocumentObjectModel {
|
||||||
if !parser.dom.constructed {
|
if !parser.dom.constructed {
|
||||||
parser.generate_tag()
|
parser.generate_tag()
|
||||||
|
@ -276,10 +257,3 @@ pub fn (mut parser Parser) get_dom() DocumentObjectModel {
|
||||||
}
|
}
|
||||||
return parser.dom
|
return parser.dom
|
||||||
}
|
}
|
||||||
|
|
||||||
/*pub fn (mut parser Parser) get_xpath() XPath {
|
|
||||||
dom := parser.get_dom()
|
|
||||||
return XPath{
|
|
||||||
dom: dom
|
|
||||||
}
|
|
||||||
}*/
|
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
module html
|
module html
|
||||||
|
|
||||||
//import net.http
|
import strings
|
||||||
|
|
||||||
fn test_split_parse() {
|
fn test_split_parse() {
|
||||||
mut parser := Parser{}
|
mut parser := Parser{}
|
||||||
parser.initialize_all()
|
parser.init()
|
||||||
parser.split_parse('<!doctype htm')
|
parser.split_parse('<!doctype htm')
|
||||||
parser.split_parse('l public')
|
parser.split_parse('l public')
|
||||||
parser.split_parse('><html><he')
|
parser.split_parse('><html><he')
|
||||||
|
@ -16,37 +16,26 @@ fn test_split_parse() {
|
||||||
parser.split_parse('Nice Test!</h3>')
|
parser.split_parse('Nice Test!</h3>')
|
||||||
parser.split_parse('</bo\n\n\ndy></html>')
|
parser.split_parse('</bo\n\n\ndy></html>')
|
||||||
parser.finalize()
|
parser.finalize()
|
||||||
assert parser.get_tags().len == 11
|
assert parser.tags.len == 11
|
||||||
assert parser.get_tags()[3].get_content() == ' Hum... A Tit\nle'
|
assert parser.tags[3].content == ' Hum... A Tit\nle'
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_giant_string() {
|
fn test_giant_string() {
|
||||||
mut temp_html := '<!doctype html><html><head><title>Giant String</title></head><body>'
|
mut temp_html := strings.new_builder(200)
|
||||||
for counter := 0; counter < 2000; counter++ {
|
|
||||||
temp_html += "<div id='name_$counter' class='several-$counter'>Look at $counter</div>"
|
|
||||||
}
|
|
||||||
temp_html += '</body></html>'
|
|
||||||
mut parser := Parser{}
|
mut parser := Parser{}
|
||||||
parser.parse_html(temp_html, false)
|
temp_html.write('<!doctype html><html><head><title>Giant String</title></head><body>')
|
||||||
assert parser.get_tags().len == 4009
|
for counter := 0; counter < 2000; counter++ {
|
||||||
|
temp_html.write("<div id='name_$counter' class='several-$counter'>Look at $counter</div>")
|
||||||
|
}
|
||||||
|
temp_html.write('</body></html>')
|
||||||
|
parser.parse_html(temp_html.str())
|
||||||
|
assert parser.tags.len == 4009
|
||||||
}
|
}
|
||||||
|
|
||||||
fn test_script_tag() {
|
fn test_script_tag() {
|
||||||
temp_html := "<html><body><script>\nvar googletag = googletag || {};\n
|
|
||||||
googletag.cmd = googletag.cmd || [];if(3 > 5) {console.log('Birl');}\n</script></body></html>"
|
|
||||||
mut parser := Parser{}
|
mut parser := Parser{}
|
||||||
parser.parse_html(temp_html, false)
|
script_content := "\nvar googletag = googletag || {};\ngoogletag.cmd = googletag.cmd || [];if(3 > 5) {console.log(\'Birl\');}\n"
|
||||||
assert parser.get_tags()[2].get_content().len == 101
|
temp_html := '<html><body><script>$script_content</script></body></html>'
|
||||||
|
parser.parse_html(temp_html)
|
||||||
|
assert parser.tags[2].content.len == script_content.replace('\n', '').len
|
||||||
}
|
}
|
||||||
|
|
||||||
/*fn test_download_source() {
|
|
||||||
println('Fetching github data in pastebin')
|
|
||||||
resp := http.get('https://pastebin.com/raw/5snUQgqN') or {
|
|
||||||
println('failed to fetch data from the server')
|
|
||||||
return
|
|
||||||
}
|
|
||||||
println('Finalized fetching, start parsing')
|
|
||||||
mut parser := Parser{}
|
|
||||||
parser.parse_html(resp.text, false)
|
|
||||||
assert parser.get_tags().len == 2244
|
|
||||||
}*/
|
|
||||||
|
|
|
@ -1,20 +1,22 @@
|
||||||
module html
|
module html
|
||||||
|
|
||||||
|
import strings
|
||||||
|
|
||||||
enum CloseTagType {
|
enum CloseTagType {
|
||||||
in_name
|
in_name
|
||||||
new_tag
|
new_tag
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Tag holds the information of an HTML tag.
|
||||||
[ref_only]
|
[ref_only]
|
||||||
pub struct Tag {
|
pub struct Tag {
|
||||||
pub mut:
|
pub mut:
|
||||||
name string
|
name string
|
||||||
content string
|
content string
|
||||||
children []&Tag
|
children []&Tag
|
||||||
mut:
|
|
||||||
attributes map[string]string // attributes will be like map[name]value
|
attributes map[string]string // attributes will be like map[name]value
|
||||||
last_attribute string
|
last_attribute string
|
||||||
parent &Tag = C.NULL
|
parent &Tag = 0
|
||||||
position_in_parent int
|
position_in_parent int
|
||||||
closed bool
|
closed bool
|
||||||
close_type CloseTagType = .in_name
|
close_type CloseTagType = .in_name
|
||||||
|
@ -26,62 +28,45 @@ fn (mut tag Tag) add_parent(t &Tag, position int) {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn (mut tag Tag) add_child(t &Tag) int {
|
fn (mut tag Tag) add_child(t &Tag) int {
|
||||||
mut children := tag.children
|
tag.children << t
|
||||||
children << t
|
|
||||||
tag.children = children
|
|
||||||
return tag.children.len
|
return tag.children.len
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn (tag Tag) get_children() []&Tag {
|
// text returns the text contents of the tag.
|
||||||
return tag.children
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn (tag Tag) get_parent() &Tag {
|
|
||||||
return tag.parent
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn (tag Tag) get_name() string {
|
|
||||||
return tag.name
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn (tag Tag) get_content() string {
|
|
||||||
return tag.content
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn (tag Tag) get_attributes() map[string]string {
|
|
||||||
return tag.attributes
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn (tag Tag) text() string {
|
pub fn (tag Tag) text() string {
|
||||||
if tag.name.len >= 2 && tag.name[0..2] == 'br' {
|
if tag.name.len >= 2 && tag.name[..2] == 'br' {
|
||||||
return '\n'
|
return '\n'
|
||||||
}
|
}
|
||||||
mut to_return := tag.content.replace('\n', '')
|
mut text_str := strings.new_builder(200)
|
||||||
for index := 0; index < tag.children.len; index++ {
|
text_str.write(tag.content.replace('\n', ''))
|
||||||
to_return += tag.children[index].text()
|
for child in tag.children {
|
||||||
|
text_str.write(child.text())
|
||||||
}
|
}
|
||||||
return to_return
|
return text_str.str()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn (tag &Tag) str() string {
|
pub fn (tag &Tag) str() string {
|
||||||
mut to_return := '<$tag.name'
|
mut html_str := strings.new_builder(200)
|
||||||
for key in tag.attributes.keys() {
|
html_str.write('<$tag.name')
|
||||||
to_return += ' $key'
|
for key, value in tag.attributes {
|
||||||
value := tag.attributes[key]
|
html_str.write(' $key')
|
||||||
if value.len > 0 {
|
if value.len > 0 {
|
||||||
to_return += '=' + '"${tag.attributes[key]}"'
|
html_str.write('="$value"')
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
to_return += if tag.closed && tag.close_type == .in_name { '/>' } else { '>' }
|
html_str.write(if tag.closed && tag.close_type == .in_name {
|
||||||
to_return += '$tag.content'
|
'/>'
|
||||||
|
} else {
|
||||||
|
'>'
|
||||||
|
})
|
||||||
|
html_str.write(tag.content)
|
||||||
if tag.children.len > 0 {
|
if tag.children.len > 0 {
|
||||||
// println('${tag.name} have ${tag.children.len} childrens')
|
for child in tag.children {
|
||||||
for index := 0; index < tag.children.len; index++ {
|
html_str.write(child.str())
|
||||||
to_return += tag.get_children()[index].str()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if !tag.closed || tag.close_type == .new_tag {
|
if !tag.closed || tag.close_type == .new_tag {
|
||||||
to_return += '</$tag.name>'
|
html_str.write('</$tag.name>')
|
||||||
}
|
}
|
||||||
return to_return
|
return html_str.str()
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue