189 lines
		
	
	
		
			4.5 KiB
		
	
	
	
		
			V
		
	
	
			
		
		
	
	
			189 lines
		
	
	
		
			4.5 KiB
		
	
	
	
		
			V
		
	
	
| // Copyright (c) 2019-2021 Alexander Medvednikov. All rights reserved.
 | |
| // Use of this source code is governed by an MIT license
 | |
| // that can be found in the LICENSE file.
 | |
| module builtin
 | |
| 
 | |
// utf8_char_len returns the length in bytes (1 to 4) of the UTF-8 sequence
// that is introduced by the lead byte `b`.
// Bytes whose upper four bits are below 0xC (ASCII and continuation bytes)
// yield 1.
pub fn utf8_char_len(b byte) int {
	// 0xe5000000 is a packed table of sixteen 2-bit entries, each holding
	// (length - 1). The upper four bits of `b`, doubled, give the bit offset
	// of the entry to read.
	bit_offset := (b >> 3) & 0x1e
	return ((0xe5000000 >> bit_offset) & 3) + 1
}
 | |
| 
 | |
// utf32_to_str converts the UTF-32 code point `code` to a UTF-8 encoded
// string, using a freshly allocated 5 byte buffer as storage.
// An empty string is returned when utf32_to_str_no_malloc does not encode
// `code` (it has no branch for values above 0x10FFFF).
pub fn utf32_to_str(code u32) string {
	unsafe {
		// the longest UTF-8 sequence is 4 bytes, plus 1 for the terminating 0
		mut buffer := malloc(5)
		res := utf32_to_str_no_malloc(code, buffer)
		if res.len == 0 {
			// Nothing was encoded, so the result does not reference the buffer.
			// Release it here, otherwise it is leaked on every rejected code.
			free(buffer)
		}
		return res
	}
}
 | |
| 
 | |
// utf32_to_str_no_malloc encodes the code point `code` as UTF-8 into `buf`,
// and returns the string that `tos` builds over `buf`.
// `buf` must have room for at least 5 bytes: up to 4 encoded bytes, followed
// by a terminating 0.
// When `code` is above 0x10FFFF, `buf` is left untouched and an empty string
// is returned.
[unsafe]
pub fn utf32_to_str_no_malloc(code u32, buf voidptr) string {
	mut res := ''
	unsafe {
		mut buffer := &byte(buf)
		// The range checks are done on the unsigned value on purpose: converting
		// to `int` first wraps codes >= 0x80000000 to negative numbers, which
		// then pass the `<= 0x7F` check and get emitted as a bogus single byte.
		if code <= 0x7F {
			// 1 byte: 0xxxxxxx
			buffer[0] = byte(code)
			buffer[1] = 0
			res = tos(buffer, 1)
		} else if code <= 0x7FF {
			// 2 bytes: 110xxxxx 10xxxxxx
			buffer[0] = 0xC0 | byte(code >> 6)
			buffer[1] = 0x80 | byte(code & 0x3F)
			buffer[2] = 0
			res = tos(buffer, 2)
		} else if code <= 0xFFFF {
			// 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
			buffer[0] = 0xE0 | byte(code >> 12)
			buffer[1] = 0x80 | (byte(code >> 6) & 0x3F)
			buffer[2] = 0x80 | byte(code & 0x3F)
			buffer[3] = 0
			res = tos(buffer, 3)
		} else if code <= 0x10FFFF {
			// 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			buffer[0] = 0xF0 | byte(code >> 18)
			buffer[1] = 0x80 | (byte(code >> 12) & 0x3F)
			buffer[2] = 0x80 | (byte(code >> 6) & 0x3F)
			buffer[3] = 0x80 | byte(code & 0x3F)
			buffer[4] = 0
			res = tos(buffer, 4)
		}
	}
	res.is_lit = 1 // let autofree know this string doesn't have to be freed
	return res
}
 | |
| 
 | |
// utf32_code returns the UTF-32 code point of the single UTF-8 encoded
// character stored in `_rune`.
// An empty string yields 0, and a one byte string yields that byte as is.
// The string length is used as the sequence length; the lead byte itself is
// not validated.
pub fn (_rune string) utf32_code() int {
	n := _rune.len
	if n == 0 {
		return 0
	}
	if n == 1 {
		// plain ASCII needs no decoding
		return int(_rune[0])
	}
	// Shifting the lead byte left by the sequence length, within 8 bits, drops
	// its length marker bits and keeps only the payload bits.
	lead := byte(byte(int(_rune[0])) << n)
	mut code := int(lead)
	// The first step moves the lead payload into place (6 - n more bits);
	// every later continuation byte contributes a full 6 bits.
	mut step := 6 - n
	for idx in 1 .. n {
		code = code << step
		code |= int(_rune[idx]) & 63 // 0x3f
		step = 6
	}
	return code
}
 | |
| 
 | |
// utf8_len returns the number of leading zero bits of `c`, capped at 7
// (both 0 and 1 yield 7).
// NOTE(review): presumably callers pass the bitwise inverted lead byte, so
// that the result is the count of leading one bits, i.e. the length to read -
// confirm against the call sites.
fn utf8_len(c byte) int {
	mut zeros := 0
	mut probe := byte(0x80)
	// walk from the most significant bit down, the lowest bit is never tested
	for zeros < 7 && (c & probe) == 0 {
		zeros++
		probe >>= 1
	}
	return zeros
}
 | |
| 
 | |
// utf8_str_len returns the length of `s` in code points.
// It hops from lead byte to lead byte, using the sequence length announced by
// each lead byte; the continuation bytes are not inspected.
pub fn utf8_str_len(s string) int {
	mut count := 0
	mut pos := 0
	for pos < s.len {
		lead := unsafe { s.str[pos] }
		// packed 2-bit table lookup: bytes occupied by the current sequence
		pos += ((0xe5000000 >> ((lead >> 3) & 0x1e)) & 3) + 1
		count++
	}
	return count
}
 | |
| 
 | |
// utf8_str_visible_length returns the length of `s` for formatting purposes,
// i.e. the number of "characters" (columns) it occupies.
// Combining diacritical marks count as 0, the wide ranges listed below (CJK,
// Hangul, emoji, ...) count as 2, and everything else counts as 1.
// Counting stops at an incomplete trailing UTF-8 sequence.
// This is a simplified implementation. If you need a specification compliant
// width, use utf8.east_asian.display_width.
pub fn utf8_str_visible_length(s string) int {
	mut l := 0
	mut ul := 1
	for i := 0; i < s.len; i += ul {
		c := unsafe { s.str[i] }
		// length in bytes of the sequence starting at `c` (packed 2-bit table)
		ul = ((0xe5000000 >> ((c >> 3) & 0x1e)) & 3) + 1
		if i + ul > s.len { // incomplete UTF-8 sequence
			return l
		}
		l++
		// avoid the match if not needed
		if ul == 1 {
			continue
		}
		// Recognize combining characters and wide characters.
		// `r` holds the raw UTF-8 bytes of the sequence, packed big endian, so
		// all the ranges below are expressed in encoded bytes, not code points.
		match ul {
			2 {
				r := u64((u16(c) << 8) | unsafe { s.str[i + 1] })
				if r >= 0xcc80 && r < 0xcdb0 {
					// diacritical marks: they take no column of their own
					l--
				}
			}
			3 {
				r := u64((u32(c) << 16) | unsafe { (u32(s.str[i + 1]) << 8) | s.str[i + 2] })
				if (r >= 0xe1aab0 && r <= 0xe1ac7f)
					|| (r >= 0xe1b780 && r <= 0xe1b87f)
					|| (r >= 0xe28390 && r <= 0xe2847f)
					|| (r >= 0xefb8a0 && r <= 0xefb8af) {
					// diacritical marks extended
					// diacritical marks supplement
					// diacritical marks for symbols
					// (the last range has no label in the original comments)
					l--
				} else if (r >= 0xe18480 && r <= 0xe1859f)
					|| (r >= 0xe2ba80 && r <= 0xe2bf95)
					|| (r >= 0xe38080 && r <= 0xe4b77f)
					|| (r >= 0xe4b880 && r <= 0xea807f)
					|| (r >= 0xeaa5a0 && r <= 0xeaa79f)
					|| (r >= 0xeab080 && r <= 0xed9eaf)
					|| (r >= 0xefa480 && r <= 0xefac7f)
					|| (r >= 0xefb8b8 && r <= 0xefb9af) {
					// Hangul, CJK and CJK Unified Ideographs ranges:
					// wide characters, they take two columns
					l++
				}
			}
			4 {
				r := u64((u32(c) << 24) | unsafe {
					(u32(s.str[i + 1]) << 16) | (u32(s.str[i + 2]) << 8) | s.str[i + 3]
				})
				// Enclosed Ideographic Supplement
				// Emoji
				// CJK Unified Ideographs Extension B-G
				// NOTE: the first lower bound is 0xf09f8880 (U+1F200). It used to be
				// written 0x0f9f8880, which made every 4 byte character up to
				// U+1F28F count as wide.
				if (r >= 0xf09f8880 && r <= 0xf09f8a8f)
					|| (r >= 0xf09f8c80 && r <= 0xf09f9c90)
					|| (r >= 0xf09fa490 && r <= 0xf09fa7af)
					|| (r >= 0xf0a08080 && r <= 0xf180807f) {
					l++
				}
			}
			else {}
		}
	}
	return l
}
 |