searchtag # rextended ucs2utf8
I have completed the function
Without using tables, it converts all UCS-2 (2-byte Unicode code point) characters to percent-encoded UTF-8…
# UCS2toUTF8 - convert a UCS-2 byte string to percent-encoded UTF-8.
#
# Parameter:
#   1  string whose bytes are UCS-2 code units, two bytes per unit, high byte first.
# Returns:
#   A string where every resulting UTF-8 byte is written as %XX with uppercase hex digits.
#   An empty string is returned when the argument is not a string or is empty.
# Errors:
#   Raises an error when the input holds an odd number of bytes, because the last
#   code unit would then be incomplete.
# Limitation:
#   Every 16-bit unit is encoded on its own, so surrogate code units 0xD800-0xDFFF
#   are emitted as separate 3-byte sequences and are not combined into one code point.
:global UCS2toUTF8 do={
    # Format one byte value as two uppercase hex digits.
    :local numbyte2hex do={
        :local input [:tonum $1]
        :local hexchars "0123456789ABCDEF"
        :local convert [:pick $hexchars (($input >> 4) & 0xF)]
        :set convert ($convert.[:pick $hexchars ($input & 0xF)])
        :return $convert
    }

    # Build a 256 character string where the character at index N is the byte with value N.
    # Each character is produced by parsing a string literal with a two digit hex escape.
    :local charsString ""
    :for x from=0 to=15 step=1 do={ :for y from=0 to=15 step=1 do={
        :local tmpHex "$[:pick "0123456789ABCDEF" $x ($x+1)]$[:pick "0123456789ABCDEF" $y ($y+1)]"
        :set charsString "$charsString$[[:parse "(\"\\$tmpHex\")"]]"
    } }

    # Return the byte value of a single character by locating it in the lookup string,
    # or -1 when the argument is missing, empty, or longer than one character.
    :local chr2int do={:if (($1="") or ([:len $1] > 1) or ([:typeof $1] = "nothing")) do={:return -1}; :return [:find $2 $1 -1]}

    :local string $1
    :if (([:typeof $string] != "str") or ($string = "")) do={ :return "" }

    :local lenstr [:len $string]
    # A trailing single byte has no low byte partner. Reject it instead of letting
    # the missing byte be treated as -1 and corrupt the last code point.
    :if (($lenstr & 1) = 1) do={
        :error "UCS-2 input must contain an even number of bytes"
    }

    :local output ""
    :for pos from=0 to=($lenstr - 1) step=2 do={
        # Combine the two bytes, high byte first, into one 16-bit code point.
        :local highByte [$chr2int [:pick $string $pos ($pos + 1)] $charsString]
        :local lowByte [$chr2int [:pick $string ($pos + 1) ($pos + 2)] $charsString]
        :local input (($highByte * 0x100) + $lowByte)
        :local results [:toarray ""]
        :local utf ""
        :if ($input > 0x7F) do={
            :if ($input > 0x7FF) do={
                :if ($input > 0xFFFF) do={
                    # Defensive guard: two bytes cannot exceed 0xFFFF.
                    :if ($input > 0x10FFFF) do={
                        :error "UTF-8 do not have code point > of 0x10FFFF"
                    } else={
                        :error "UCS-2 do not have code point > of 0xFFFF"
# the following commented lines are not used on UCS-2
# but are kept ready for a future extension covering all Unicode code points from 0x000000 to 0x10FFFF
# :set ($results->0) (0xF0 + ( $input >> 18 ))
# :set ($results->1) (0x80 + (($input >> 12) & 0x3F))
# :set ($results->2) (0x80 + (($input >> 6) & 0x3F))
# :set ($results->3) (0x80 + ( $input & 0x3F))
                    }
                } else={
                    # 0x0800-0xFFFF: three byte sequence.
                    :set ($results->0) (0xE0 + ( $input >> 12 ))
                    :set ($results->1) (0x80 + (($input >> 6) & 0x3F))
                    :set ($results->2) (0x80 + ( $input & 0x3F))
                }
            } else={
                # 0x0080-0x07FF: two byte sequence.
                :set ($results->0) (0xC0 + ($input >> 6))
                :set ($results->1) (0x80 + ($input & 0x3F))
            }
        } else={
            # 0x0000-0x007F: single byte, same value as the code point.
            :set ($results->0) $input
        }
        # Append every UTF-8 byte as a percent sign followed by two hex digits.
        :foreach item in=$results do={
            :set utf "$utf%$[$numbyte2hex $item]"
        }
        :set output "$output$utf"
    }
    :return $output
}
{
    # Sample UCS-2 input as it would be read from an SMS: two bytes per character, high byte first.
    # The literal must be delimited by plain ASCII double quotes.
    :local ucsreadedfromsms "\00h\00e\00l\00l\00o\00\20\00m\00y\00\20\00f\00r\00i\00e\00n\00d\00\20\00c\00a\00m\00i\00\F3\00n\00\20\00\D1\00\F1"
    # Print the percent-encoded UTF-8 form of the sample.
    :put [$UCS2toUTF8 $ucsreadedfromsms]
}
results:
%68%65%6C%6C%6F%20%6D%79%20%66%72%69%65%6E%64%20%63%61%6D%69%C3%B3%6E%20%C3%91%C3%B1
The string in the example is “hello my friend camión Ññ” converted to UCS-2.
Code points: ó = 00 F3, Ñ = 00 D1, ñ = 00 F1
To test the results:
https://www.urldecoder.org/
EDIT: Reformatted, fixed for non-CP1252 characters.