OpenVMS Source-Code Demos

UNICODE_TO_UTF8

	function string unicode_to_utf8(long uni%())				!
	!==================================================================================================
	! title  : unicode_to_utf8.fun
	! purpose: convert an array of Unicode code points into a UTF-8 encoded byte string
	! history:
	! ver who when   what
	! --- --- ------ ----------------------------------------------------------------------------------
	! 100 NSR 170315 1. original effort
	!==================================================================================================
	! input  : uni%(0)	number of code points which follow
	!	   uni%(1..n)	one code point per element
	! output : function result is the UTF-8 octet string (empty when uni%(0) is less than 1)
	! notes  : code points which cannot be legally encoded are silently discarded. These are:
	!	   1. negative values
	!	   2. values above 0010,FFFF (RFC-3629 upper limit)
	!	   3. UTF-16 surrogates 0000,D800-0000,DFFF (RFC-3629 section 3 prohibits encoding them)
	!==================================================================================================
	! UTF-8 encoding
	! 1. RFC-2279: http://www.faqs.org/rfcs/rfc2279.html
	! 2. RFC-3629: https://tools.ietf.org/html/rfc3629 (limits UTF-8 to 4 octets; some code points in
	!			the 21-bit address space are not being used (notice the 'z' on line 4))
	!
	! UCS-4 range (hex)	UTF-8 octet sequence (binary)				Data Bits
	! -------------------	-----------------------------				---------
	! 0000,0000-0000,007F	0xxxxxxx						 7 bits
	! 0000,0080-0000,07FF	110xxxxx 10xxxxxx					11 bits
	! 0000,0800-0000,FFFF	1110xxxx 10xxxxxx 10xxxxxx				16 bits
	! 0001,0000-001F,FFFF	11110zXX 10xxxxxx 10xxxxxx 10xxxxxx			21 bits (RFC limit)
	! 0020,0000-03FF,FFFF	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx		26 bits (invalid)
	! 0400,0000-7FFF,FFFF	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx	31 bits (invalid)
	!
	! This function only emits the first four rows and stops row four at 0010,FFFF
	!==================================================================================================
	option type=explicit							!
	!
	declare string	out$, temp$						!
	declare long	cp%, temp%, i%, k%, bytes%, bits%			!
	!-----------------------------------------------------------------------
	!	main
	!-----------------------------------------------------------------------
	out$ = ""								!
	k% = uni%(0)								! data length is stored here
	for i% = 1 to k%							! scan the code points
	    cp% = uni%(i%)							! grab one code point
	    !
	    !	decide how many octets this code point needs (first matching case wins)
	    !
	    select cp%								!
		case 0		to x"0007F"					!
			bytes% = 1						!
		case x"00080"	to x"007FF"					!
			bytes% = 2						!
		case x"0D800"	to x"0DFFF"					! UTF-16 surrogates
		     goto next_code_point					! not legal in UTF-8 so discard
		case x"00800"	to x"0FFFF"					!
			bytes% = 3						!
		case x"10000"	to x"10FFFF"					!
			bytes% = 4						!
		case else							!
		     goto next_code_point					! throw away anything else
	    end select								!
	    !
	    !	build the sequence from right to left: each trailing octet carries the
	    !	low six bits, then whatever remains goes into the lead octet
	    !
	    temp$ = ""								! zap
	    temp% = bytes%							! octets still to produce
	    while temp% > 0							!
		if temp% = 1 then						! lead octet (produced last)
		    select bytes%						!
			case 1							!
			    bits% = cp%						! plain 7-bit value
			case 2							!
			    bits% = b"11000000" or cp%				! 110xxxxx
			case 3							!
			    bits% = b"11100000" or cp%				! 1110xxxx
			case 4							!
			    bits% = b"11110000" or cp%				! 11110xxx
		    end select							!
		else								! continuation octet
		    bits% = b"10000000" or (cp% and x"3f")			! 10xxxxxx (lowest 6 bits)
		    cp% = cp% / 64						! shift right by six bits
		end if								!
		temp$ = chr$(bits%) + temp$					! prepend to the sequence
		temp% = temp% - 1						!
	    next								! end of while
	    out$ = out$ + temp$							! add sequence to the result
	    !
	    next_code_point:							!
	next i%									!
	unicode_to_utf8 = out$							! pass string back
	end function								! adios
	!