OpenVMS Source-Code Demos

MIXED_TO_UNICODE

	function long mixed_to_unicode(string inbound$, long uni%())		!
	!==================================================================================================
	! title  : mixed_to_unicode.fun
	! purpose: Scan inbound data (a byte string that may mix ASCII, UTF-8 and cp1252) and convert it
	!	   into an array of Unicode code points.
	!	   cp1252 is also known as Windows-1252 (or "ANSI") and is a superset of ISO-8859-1
	! params : inbound$ = source bytes (not modified)
	!	   uni%()   = receives the code points in elements 1..n; element 0 receives n.
	!		      Each inbound byte produces at most one element, so the caller must
	!		      dimension this array to at least len(inbound$) elements.
	! returns: n = the number of code points stored in uni%()
	! caveat2: There are two ways to do this: Strict and Relaxed
	!	   1) Strict : everything above ASCII 127 must be legal UTF-8 or we throw it away
	!	   2) Relaxed: anything above ASCII 127 which is not legal UTF-8 is assumed to be cp1252 so
	!		must be mapped to unicode or thrown away
	!	   This function implements the Relaxed method.
	! legal  : A multi-octet sequence is only accepted as UTF-8 when (see RFC-3629):
	!	   a) every trailing octet looks like 10xx-xxxx
	!	   b) it is the shortest possible encoding of the value (overlong forms are rejected)
	!	   c) the value is not a UTF-16 surrogate (D800-DFFF)
	!	   d) the value is not above 10FFFF
	!	   When any test fails the lead octet is stored as a single cp1252 character and scanning
	!	   resumes at the very next octet.
	! history:
	! ver who when   what
	! --- --- ------ ----------------------------------------------------------------------------------
	! 100 NSR 170315 1. original effort (derived from the misnamed function: UNICODE_TO_ISO_106.FUN)
	!==================================================================================================
	! UTF-8 encoding
	! 1. RFC-2279: http://www.faqs.org/rfcs/rfc2279.html
	! 2. RFC-3629: https://tools.ietf.org/html/rfc3629 (limits UTF-8 to 4 octets; some code points
	!	in the 21-bit address space are being used (notice the 'z' on line 4))
	!
	! UCS-4 range (hex)	UTF-8 octet sequence (binary)				Data Bits
	! -------------------	-----------------------------				---------
	! 0000,0000-0000,007F	0xxxxxxx						 7 bits
	! 0000,0080-0000,07FF	110xxxxx 10xxxxxx					11 bits
	! 0000,0800-0000,FFFF	1110xxxx 10xxxxxx 10xxxxxx				16 bits
	! 0001,0000-001F,FFFF	11110zXX 10xxxxxx 10xxxxxx 10xxxxxx			21 bits (RFC limit)
	! 0020,0000-03FF,FFFF	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx		26 bits (invalid)
	! 0400,0000-7FFF,FFFF	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx	31 bits (invalid)
	!==================================================================================================
	option type=explicit							!
	!
	declare string	tst$, alt$, src$					!
	declare long	ucs%, lowest%, tst%, alt%, i%, j%, k%, bytes%, count_out%	!
	!-----------------------------------------------------------------------
	!	main
	!-----------------------------------------------------------------------
	count_out% = 0								! nothing stored yet
	k% = len(inbound$)							! measure the length of inbound
	src$ = inbound$ + space$(6)						! pad so look-ahead past the end sees spaces
										! (a space is never a legal trailing octet)
	for i% = 1 to k%							! scan the string
	    tst$ = mid$(src$, i%, 1)						! isolate tst character
	    tst% = asc(tst$)							! convert to ascii
	    if tst% <= 127 then							! plain 7-bit ASCII
		count_out% = count_out% + 1					!
		uni%(count_out%) = tst%						! store this value
		goto get_next_char						! next iteration
	    end if								!
	    !
	    !	is this the lead octet of a multi-octet UTF-8 sequence?
	    !	lowest% = smallest value which legally needs this many octets
	    !
	    if (tst% and X"e0") = x"c0" then					! test for: 110x-xxxx
		bytes%	= 2							! this might be a 2-byte sequence (or not)
		lowest%	= x"80"L						! anything smaller is overlong
		ucs%	= tst% and x"1f"					! keep 5-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    if (tst% and X"f0") = x"e0" then					! test for: 1110-xxxx
		bytes%	= 3							! this might be a 3-byte sequence (or not)
		lowest%	= x"800"L						! anything smaller is overlong
		ucs%	= tst% and x"0f"					! keep 4-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    if (tst% and X"f8") = x"f0" then					! test for: 1111-0xxx
		bytes%	= 4							! this might be a 4-byte sequence (or not)
		lowest%	= x"10000"L						! anything smaller is overlong
		ucs%	= tst% and x"07"					! keep 3-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    !
	    !	definitely not UTF-8 so treat this octet as cp1252
	    !	(128-159 differ from unicode; everything else maps one-to-one)
	    !
	    select tst%								! test the original code
		case 129, 141, 143, 144, 157					! not defined in cp1252
			ucs% = 0						! zero means: throw it away
		case 128
			ucs% = x'20ac'
		case 130
			ucs% = x'201a'
		case 131
			ucs% = x'0192'
		case 132
			ucs% = x'201e'
		case 133
			ucs% = x'2026'
		case 134
			ucs% = x'2020'
		case 135
			ucs% = x'2021'
		case 136
			ucs% = x'02c6'
		case 137
			ucs% = x'2030'
		case 138
			ucs% = x'0160'
		case 139
			ucs% = x'2039'
		case 140
			ucs% = x'0152'
		case 142
			ucs% = x'017d'
		case 145
			ucs% = x'2018'
		case 146
			ucs% = x'2019'
		case 147
			ucs% = x'201c'
		case 148
			ucs% = x'201d'
		case 149
			ucs% = x'2022'
		case 150
			ucs% = x'2013'
		case 151
			ucs% = x'2014'
		case 152
			ucs% = x'02dc'
		case 153
			ucs% = x'2122'
		case 154
			ucs% = x'0161'
		case 155
			ucs% = x'203a'
		case 156
			ucs% = x'0153'
		case 158
			ucs% = x'017e'
		case 159
			ucs% = x'0178'
		case else
			ucs% = tst%						! 160-255 (and 248-255) pass straight through
	    end select								!
	    if ucs% > 0 then							! only store defined characters
		count_out% = count_out% + 1					!
		uni%(count_out%) = ucs%						! store this value
	    end if								!
	    goto get_next_char							! next iteration
	    !
	    !	might be UTF-8 depending upon the following bytes
	    !	entry:	i%	= points to tst$ (first utf-8 octet)
	    !		bytes%	= expected total number of octets (2-4)
	    !		lowest%	= smallest legal value for this many octets
	    !		ucs%	= data bits taken from the first octet
	    !
	    process_uni:							!
	    for j% = 1 to (bytes%-1)						!
		alt$ = mid$(src$, i%+j%, 1)					! isolate character after tst$
		alt% = asc(alt$)						!
		if (alt% and x"c0") = x"80"					! is this a secondary utf-8 octet? (10xx-xxxx)
		then								! yes
		    alt% = (alt% and x"3f")					! isolate 6-bits
		    ucs% = ucs% * 64%						! shift this by 6 places
		    ucs% = ucs% + alt%						! merge bits
		else								! no
		    goto not_utf8						! sequence is broken
		end if								!
	    next j%								!
	    !
	    !	the octets are well formed but the value must also be legal:
	    !	not overlong, not above 10FFFF, not a UTF-16 surrogate
	    !
	    if (ucs% < lowest%) or (ucs% > x"10ffff"L) or ((ucs% >= x"d800"L) and (ucs% <= x"dfff"L)) then
		goto not_utf8							!
	    end if								!
	    !
	    count_out% = count_out% + 1						!
	    uni%(count_out%) = ucs%						! is legal UTF-8 so store unicode
	    i% = i% + bytes% - 1						! eat some chars (NEXT will eat one more)
	    goto get_next_char							!
	    !-------------------------------------------------------------------
	    !	not legal UTF-8: keep only the lead octet. Lead octets are always
	    !	in the range 192-247 where cp1252 and unicode are identical so no
	    !	mapping is required. The octets which follow are rescanned.
	    !-------------------------------------------------------------------
	    not_utf8:								!
	    count_out% = count_out% + 1						!
	    uni%(count_out%) = tst%						! store original first byte as-is
	    get_next_char:							!
	next i%									! advance by tst
	uni%(0) = count_out%							! element zero holds the count
	mixed_to_unicode = count_out%						! pass back to caller
	end function								! adios
	!