OpenVMS Source-Code Demos

ISO_8859_TO_ASCII

1000	%title "iso-8859-to-ascii-xxx.bas"					!
	%ident                      "Version_103.1"				! <<<---+
	declare string constant	k_version = "103.1"			,	! <<<---+					&
				k_program = "iso-8859-to-ascii"			!
	!=========================================================================================================================
	! title  : iso-8859-to-ascii-xxx.BAS
	! author : Neil Rieck (http://www3.sympatico.ca/n.rieck/) (n.rieck@sympatico.ca)
	! created: 2001-08-22
	! History:
	! ver who when   what
	! --- --- ------ --------------------------------------------------------------------------------------------------------
	! 100 NSR 020822 1. original stub (for use else where)
	! 101 NSR 081211 1. started work on integrating a unicode translator
	! 102 NSR 121009 1. bug fix in unicode decoder
	! 103 NSR 121009 1. more work in the test cases
	!========================================================================================================================
	option type=explicit							! no kid stuff
	set no prompt								! no ? on input
	!
	declare string constant	htab	= '9'C			,		! horizontal tab		&
				null	= '0'C			,		! null				&
				alpha	= "0123456789ABCDEF"			!
	!
	declare string	iso_translate$						,&
			input_data$						,&
			temp$							,&
			output_data$						,&
			choice$							,&
			junk$							,&
		long	i%							,&
			debug%							,&
			c1							,&
			c2							,&
			column
	!
	external string function unicode_to_iso( string )			!
	external string function long_to_hex( long, long )			!
	!=======================================================================
	!	<<< init >>>
	!=======================================================================
	init:
	gosub init_iso_translator						!
	!
	!	load sample data array
	!
	dim string u$(99)							!
	dim string d$(99)
	!
	for i% = 0 to 32
	    c2 = i% + 32
	    junk$ = chr$(X"c0"L) + chr$(128% or c2)
	    u$(i%) = junk$
	    d$(i%) = long_to_hex(c2,1)
	next i%
	!
	u$(33)= u$( 1) + u$( 1) + u$( 1) + u$( 1)				!
	u$(34)= u$( 2) + u$( 2) + u$( 2) + u$( 2)				!
	u$(35)= u$( 3) + u$( 3) + u$( 3) + u$( 3)				!
	u$(36)= u$( 4) + u$( 4) + u$( 4) + u$( 4)				!
	!
	junk$ = ""								!
	for i% = 0 to 32							!
	    c2 = i% + 32							!
	    junk$ = junk$ + chr$(X"c0"L) + chr$(128% or c2)			!
	next i%									!
	u$(37)= junk$								!
	!=======================================================================
	!	<<< main >>>
	!=======================================================================
	main:
	print k_program +"_"+ k_version						!
	print string$(len(k_program +"_"+ k_version), asc("="))			! underline previous line (how will this optimize?)
	!
	print "test-data menu: "
	column = 0
	for i% = 0 to 32
	    print ">";
	    print using "<0>#"; i%;
	    print " hex=";d$(i%);
	    print " utf=";u$(i%);" ";
	    column = column + 1
	    if column = 6 then
		column = 0
		print
	    end if
	next i%
	print
	for i% = 33 to 37
	    print ">";
	    print using "##"; i%;
	    print " utf=";u$(i%)
	next i%
	print "choice? (0-37, or p/aste data string) ";
	input choice$								!
	select choice$								!
	    case "0" to "99"							!
		temp$ = u$( integer(choice$ ))					!
	    case else								!
		input "paste iso-8859 data> "; input_data$			! web data enters here
		temp$ = edit$( input_data$, 128+16+8)				! no trailing, compress, no leading
	end select								!
	temp$	= unicode_to_iso(temp$)						!
	output_data$ = xlate$( temp$, iso_translate$)				! translate ISO -> ASCII
	print "-i- 7-bit ASCII: "+ output_data$					!
	print "==============================================="
	goto fini								!
	!
	!========================================================================================
	!
	!	<<< init ISO-8859-1 character translator >>>
	!
	!	build an ASCII translation table
	!	notes:	1. remember that <NUL> is in position #1 of iso_translate$
	!		2. this routine converts some 8-bit characters into 7-bit via translation
	!========================================================================================
	init_iso_translator:
	!
	iso_translate$ = ""							! init
	!
    %let %paranoid = 1%								! paranoid filtering	(lowest 14 become <null>)
    %if  %paranoid = 0%								!
    %then									! normal filtering	(lowest 14 are as-is)
	for i% = 0 to 13							! build 7-bit table ( from <NUL> to <CR> )
	    iso_translate$ = iso_translate$ + chr$(i%)				! use these as-is
	next i%									!
    %else									! paranoid filtering	(lowest 14 become <null>)
	for i% = 0 to 13							! build 7-bit table ( from <NUL> to <CR> )
	    iso_translate$ = iso_translate$ + null				! change these to <null>
	next i%									!
	mid$( iso_translate$,  9 +1, 1) = htab					! restore our delimiter
	mid$( iso_translate$, 13 +1, 1) = " "					! convert <cr> to <sp>
    %end %if									!
	for i% = 14 to 31							! build 7-bit table ( from <SO> to <US> )
	    iso_translate$ = iso_translate$ + null				! change these to <null>
	next i%									!
	for i% = 32 to 127							! build 7-bit table
            iso_translate$ = iso_translate$ + chr$(i%)				! use these as-is
	next i%									!
	!
	!	now patch the lower ascii translation table as required by your application
	!
	mid$( iso_translate$, 96%  +1, 1) = "'"					! convert "`" to "'"
!~~~	mid$( iso_translate$, 123% +1, 1) = " "					x map "{"   to <sp>
!~~~	mid$( iso_translate$, 124% +1, 1) = " "					x map "}"   to <sp>
!~~~	mid$( iso_translate$, 125% +1, 1) = " "					x map "|"   to <sp>
!~~~	mid$( iso_translate$, 126% +1, 1) = null				x map "~"   to <null>
	mid$( iso_translate$, 127% +1, 1) = null				! map <DEL> to <null>
	!
	!	this area contains the second half of the 8 bit character set
	!
	for i% = 128 to 159							!
	    iso_translate$ = iso_translate$ + null				! change these to <null>
	next i%									!
	for i% = 160 to 255							!
	    iso_translate$ = iso_translate$ + " "				! change these to <sp>
	next i%									!
	!
	!	now patch the upper ascii translation table (a subjective translation of ISO-8859-1)
	!
	mid$( iso_translate$, 160 +1, 1) = " "					! non-break space
	mid$( iso_translate$, 161 +1, 1) = "!"					! inverted exclamation
	mid$( iso_translate$, 162 +1, 1) = "c"					! cents sign
	mid$( iso_translate$, 163 +1, 1) = "$"					! pound sign
	mid$( iso_translate$, 164 +1, 1) = "$"					! currency sign
	mid$( iso_translate$, 165 +1, 1) = "$"					! yen sign
	!
	mid$( iso_translate$, 169 +1, 1) = "c"					! copyright sign
	!
	mid$( iso_translate$, 171 +1, 1) = "<"					! left angle quotes
	!
	mid$( iso_translate$, 174 +1, 1) = "r"					! registered trademark
	!
	mid$( iso_translate$, 177 +1, 1) = "+"					! plus-minus sign
	!
	mid$( iso_translate$, 180 +1, 1) = "'"					! spacing acute
	mid$( iso_translate$, 181 +1, 1) = "u"					! mirco sign
	!
	mid$( iso_translate$, 183 +1, 1) = "."					! middle dot
	mid$( iso_translate$, 184 +1, 1) = ","					! spacing cedilla
	!
	mid$( iso_translate$, 187 +1, 1) = ">"					! right angle quotes
	!
	mid$( iso_translate$, 191 +1, 1) = "?"					! inverted question mark
	mid$( iso_translate$, 192 +1, 1) = "A"					! A	grave
	mid$( iso_translate$, 193 +1, 1) = "A"					!	acute
	mid$( iso_translate$, 194 +1, 1) = "A"					!	circumflex
	mid$( iso_translate$, 195 +1, 1) = "A"					!	tilde
	mid$( iso_translate$, 196 +1, 1) = "A"					!	diaresis
	mid$( iso_translate$, 197 +1, 1) = "A"					!	ring
	mid$( iso_translate$, 198 +1, 1) = "A"					!	ligature
	mid$( iso_translate$, 199 +1, 1) = "C"					! C	cedilla
	mid$( iso_translate$, 200 +1, 1) = "E"					! E	grave
	mid$( iso_translate$, 201 +1, 1) = "E"					!	acute
	mid$( iso_translate$, 202 +1, 1) = "E"					!	circumflex
	mid$( iso_translate$, 203 +1, 1) = "E"					!	diaresis
	mid$( iso_translate$, 204 +1, 1) = "I"					! I	grave
	mid$( iso_translate$, 205 +1, 1) = "I"					!	acute
	mid$( iso_translate$, 206 +1, 1) = "I"					!	circumflex
	mid$( iso_translate$, 207 +1, 1) = "I"					!	diaresis
	mid$( iso_translate$, 208 +1, 1) = "D"					! ETH
	mid$( iso_translate$, 209 +1, 1) = "N"					! N	tilde
	mid$( iso_translate$, 210 +1, 1) = "O"					! O	grave
	mid$( iso_translate$, 211 +1, 1) = "O"					!	acute
	mid$( iso_translate$, 212 +1, 1) = "O"					!	circumflex
	mid$( iso_translate$, 213 +1, 1) = "O"					!	tilde
	mid$( iso_translate$, 214 +1, 1) = "O"					!	diaresis
	mid$( iso_translate$, 215 +1, 1) = "x"					! Multiply Sign
	mid$( iso_translate$, 216 +1, 1) = "O"					! O	slash
	mid$( iso_translate$, 217 +1, 1) = "U"					! U	grave
	mid$( iso_translate$, 218 +1, 1) = "U"					!	acute
	mid$( iso_translate$, 219 +1, 1) = "U"					!	circumflex
	mid$( iso_translate$, 220 +1, 1) = "U"					!	diaresis
	mid$( iso_translate$, 221 +1, 1) = "Y"					! Y	acute
	mid$( iso_translate$, 222 +1, 1) = "p"					! thorn
	mid$( iso_translate$, 223 +1, 1) = "B"					! sharp s
	mid$( iso_translate$, 224 +1, 1) = "a"					! a	grave
	mid$( iso_translate$, 225 +1, 1) = "a"					!	acute
	mid$( iso_translate$, 226 +1, 1) = "a"					!	circumflex
	mid$( iso_translate$, 227 +1, 1) = "a"					!	tilde
	mid$( iso_translate$, 228 +1, 1) = "a"					!	diaeresis
	mid$( iso_translate$, 229 +1, 1) = "a"					!	ring
	mid$( iso_translate$, 230 +1, 1) = "a"					!	ligature
	mid$( iso_translate$, 231 +1, 1) = "c"					! c	cedilla
	mid$( iso_translate$, 232 +1, 1) = "e"					! e	grave
	mid$( iso_translate$, 233 +1, 1) = "e"					!	acute
	mid$( iso_translate$, 234 +1, 1) = "e"					!	circumflex
	mid$( iso_translate$, 235 +1, 1) = "e"					!	diaeresis
	mid$( iso_translate$, 236 +1, 1) = "i"					! i	grave
	mid$( iso_translate$, 237 +1, 1) = "i"					!	acute
	mid$( iso_translate$, 238 +1, 1) = "i"					!	circumflex
	mid$( iso_translate$, 239 +1, 1) = "i"					!	diaeresis
	mid$( iso_translate$, 240 +1, 1) = "o"					! eth
	mid$( iso_translate$, 241 +1, 1) = "n"					! n	tilde
	mid$( iso_translate$, 242 +1, 1) = "o"					! o	grave
	mid$( iso_translate$, 243 +1, 1) = "o"					!	acute
	mid$( iso_translate$, 244 +1, 1) = "o"					!	circumflex
	mid$( iso_translate$, 245 +1, 1) = "o"					!	tilde
	mid$( iso_translate$, 246 +1, 1) = "o"					!	diaeresis
	mid$( iso_translate$, 247 +1, 1) = "/"					! division sign
	mid$( iso_translate$, 248 +1, 1) = "o"					! o	slash
	mid$( iso_translate$, 249 +1, 1) = "u"					! u	grave
	mid$( iso_translate$, 250 +1, 1) = "u"					!	acute
	mid$( iso_translate$, 251 +1, 1) = "u"					!	circumflex
	mid$( iso_translate$, 252 +1, 1) = "u"					!	diaeresis
	mid$( iso_translate$, 253 +1, 1) = "y"					! y	acute
	mid$( iso_translate$, 254 +1, 1) = "b"					! thorn
	mid$( iso_translate$, 255 +1, 1) = "y"					! y	diaeresis
	return									!
	!=======================================================================
	!	<<< adios... >>>
	!=======================================================================
31000	fini:									!
	end									!
	!
	!########################################################################################################################
	!
	!======================================================================================
	! Title	: unicode -> iso
	! note	: UTF-8 encoding (see RFC 2279) http://www.faqs.org/rfcs/rfc2279.html
	!
	!   UCS-4 range (hex.)    UTF-8 octet sequence (binary)
	!   -------------------   -----------------------------
	!   0000 0000-0000 007F   0xxxxxxx							 		 7-data bits
	!   0000 0080-0000 07FF   110xxxxx 10xxxxxx						 0xCx-0xDx	11-data bits
	!   0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx					 0xEx		16-data bits
	!   0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx				 0xFx		21-data bits
	!   0020 0000-03FF FFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx			 0xFx		26-data bits
	!   0400 0000-7FFF FFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 0xFx		31-data bits
	!=======================================================================================
32000	function string unicode_to_iso( string inbound$ )
	option type=explicit
	declare string	cpy$,				&
			temp$,				&
		long	i%, j%, k%, z%, debug%
	!
	cpy$ = inbound$								! copy original data
	for i% = 1 to len(cpy$)							! scan the string
	    j% = asc(mid$(cpy$,i%,1))						! test current character
	    !
	    !	7-bit test
	    !
	    if (j% and B"10000000"L) = 0 then					!
		goto next_character						! leave "as-is"
	    end if								!
	    !
	    !	2-character test
	    !
	    if (j% and B"11100000"L) = 192 then					! keep 3 highest bits for group test
		!
		!	2-character transformations
		!	byte-1   byte-2
		!	110xxxxx 10xxxxxx
		!
		j% = j% and B"11111"L						! keep lower 5 bits for processing
		j% = j% * 64%							! shift left by 6 places (prep for merge)
		k% = asc(mid$(cpy$,i%+1,1))					! grab next character
		if (k% and B"11000000"L) <> 128 then				! must only be 10xxxxxx
			print "-e-unicode sanity error"				!
			print "-i-character: "+str$(i%+1)			!
			goto next_character					!
		end if								!
		k% = k% and B"111111"L						! keep lower 6 bits for processing
		z% = j% or k%							! merge the bits
		mid$(cpy$,i%  ,1) = chr$((z% and B"111100000000"L)/256%)	! process and write back to char+0
		mid$(cpy$,i%+1,1) = chr$( z% and B"11111111"L)	    		! process and write back to char+1
		i% = i% + 1							! fix pointer
	    end if								!
	    !
	    !	3-character test
	    !
	    if (j% and B"11110000"L) = 224 then					! keep 4 highest bits for group test
	    end if								!
	    !
	    !	4-character test
	    !
	    if (j% and B"11111000"L) = 240 then					! keep 5 highest bits for group test
	    end if								!
	    !
	    !	5-character test
	    !
	    if (j% and B"11111100"L) = 248 then					! keep 6 highest bits for group test
	    end if								!
	    !
	    !	7-character test
	    !
	    if (j% and B"11111110"L) = 252 then					! keep 6 highest bits for group test
	    end if								!
	    !
	    next_character:
	next i%									!
	!
	cpy$ = edit$(cpy$, 4)							! remove control characters
	if cpy$ <> inbound$  then						!
	    print "==============================================="		!
	    print "-i- UTF-8      : "+ inbound$					!
	    print "-i- ISO-8859-1 : "+ cpy$					!
	    print "==============================================="		!
	end if									!
	!
	function_exit:								!
	unicode_to_iso = cpy$							!
	end function								!
	!
	!========================================================================
	!	long_to_hex()
	!	entry:	inbound%	(data to convert)
	!		size%		(size in bytes)
	!========================================================================
32010	function string long_to_hex( long inbound%, long size% )
	option type=explicit
	declare string constant	alpha	= "0123456789ABCDEF"			!
	declare long	i%, z%, temp%						!
	declare string	junk$
	!
	temp% = inbound%							!
	for i%= 1 to (size% * 2)						!
	    z% = (temp% and 15%)						! isolate last nibble
	    junk$ = mid$(alpha,z%+1,1) + junk$					!
	    temp% = temp% / 16%							! shift by four bits
	next i%
	!
	function_exit:								!
	long_to_hex = junk$							!
	end function								!
	!========================================================================