OpenVMS Source-Code Demos

MYSQL_IMPORT_HELPER_BASIC_TEMPLATE_106_PART3

	!
	! start of template 3/3
	!33333333333333333333333333333333333333333333333333333333333333333333333
	!=======================================================================
	! file   : mysql_import_helper_basic_template_106_part3.bas
	! author : Neil Rieck
	! history:
	! 100 NSR 140708 original effort
	! 101 NSR 150106 added code to debug a corrupt datafile
	! 102 NSR 160226 documentation changes in utf-8 area
	! 103 NSR 170314 changes to the unicode routine
	! 106 NSR 170314 adding true unicode support
	!     NSR 170315 more work
	!     NSR 170316 appended two new external functions
	!=======================================================================
		count_o% = count_o% + 1						!
	    next								!
	use									!
	    handler_error% = err						! oops
	end when								!
	select handler_error%							!
	    case 11								!
		print "-i-";							!
	    case else								!
		print "-e-";							!
	end select								!
	print "status:"+str$(handler_error%)					!
	print "-i-finish: "+ date4$(0) +" "+ time$(0)				!
	print "-i-records read   : "+str$(count_i%)				!
	print "-i-records written: "+str$(count_o%)				!
	if handler_error% <> 11 then						!
	    when error in							!
		print "-w-additional information"; bel				!
		print "   VMS-STATUS: ";vmsstatus				!
		print "   RMS-STATUS: ";RMSSTATUS( 100%, STATUS )		!
		print "   RMS-VALUE : ";RMSSTATUS( 100%, VALUE )		!
		sleep 1								!
	    use									!
	    end when								!
	end if									!
	print "-i-closing channels"						!
	when error in								!
	    close #100								!
	    close #99								!
	use									!
	end when								!
	print "-i-program exit -------------------------"			!
32000	end									! <<<---***
	!#######################################################################
	!	external functions
	!#######################################################################
	!
	!	cleanup the data before writing to csv
	!
32100	function string cleanup(string inbound$, long requested_datatype%)	!
	option type=explicit							!
	external string function unicode_to_iso(string)				!
	external long	function mixed_to_unicode(string,long dim())		!					ver_106.2
	external string	function unicode_to_utf8(long dim())			!					ver_106.2
	declare  string	junk$							!
	declare  long	char_count%						!
	declare  long   unicode%(511)						! 511 4-byte wide characters per line
	!-----------------------------------------------------------------------
	!	main
	!-----------------------------------------------------------------------
	junk$ = edit$(inbound$, 128+16+8)					! trailing, compress, leading (DO NOT upcase)
	!
	!    There are two ways import a backslash
	!    1) use this clause in your import statement: FIELDS ESCAPED BY ''
	!	because escape is defaulted to "\"
	!    2) remove the backsashes from the data
	!
!	declare long i%, j%					x
!	j% = len(junk$)						x
!	for i% = 1 to j%					x
!	    if  mid$(junk$,i%,1) = "\" then			x
!		mid$(junk$,i%,1) = "/"				x
!	    end if						x
!	end if							x
	!
	select requested_datatype%				!
	case 0							! asis (do nothing)
	case 1							! cp-1252/windows-1252/iso-8859-1
	    junk$ = unicode_to_iso(junk$)			!
	case 2							! utf8
	    char_count% = mixed_to_unicode(junk$,unicode%())	!
	    unicode%(0) = char_count%				!
	    junk$  = unicode_to_utf8(unicode%()) 		!
	end select						!
	!
	cleanup = junk$						!
	end function						!
	!
32110	function string unicode_to_iso( string src$     )	!
	!==================================================================================================
	! title  : UNICODE_TO_ISO_106.FUN
	! caveat1: This function is misnamed. It should have called "utf8_to_cp1252" or "mixed_to_cp1252"
	! purpose: Scan inbound data looking for legal UTF-8 code sequences. These are converted to unicode
	!	   which is then mapped to cp1252 (also known as Windows-1252; also known as ANSI) which is
	!	   a superset of ISO-8859-1
	! caveat2: There are two ways to do this: Strict and Relaxed
	!	   1) Strict : everything above ASCII 127 must be legal UTF-8 or we throw it away
	!	   2) Relaxed:
	!	      2a. anything above ASCII 127 which is not legal UTF-8 is assumed to be cp1252 so we keep it
	!	      2b. any unicode value above 255 must be mapped back to the equivalent cp1252 code if possible
	!		  otherwise we throw it away (because we only have room for one single byte)
	! history:
	! ver who when   what
	! --- --- ------ ----------------------------------------------------------------------------------
	! 105 DGM 150401 1. renovation
	! 106 NSR 160314 1. renovation (to match utf8_decode.c)
	!     NSR 170315 2. minor cleanup + documentation changes
	!==================================================================================================
	! UTF-8 encoding
	! 1. RFC-2279: http://www.faqs.org/rfcs/rfc2279.html
	! 2. RFC-3629: https://tools.ietf.org/html/rfc3629 (limits UTF-8 to 4 octets; some code points
	!	in the 21-bit address space are being used (notice the 'z' on line 4))
	!
	! UCS-4 range (hex)	UTF-8 octet sequence (binary)				Data Bits
	! -------------------	-----------------------------				---------
	! 0000,0000-0000,007F	0xxxxxxx						 7 bits
	! 0000,0080-0000,07FF	110xxxxx 10xxxxxx					11 bits
	! 0000,0800-0000,FFFF	1110xxxx 10xxxxxx 10xxxxxx				16 bits
	! 0001,0000-001F,FFFF	11110zXX 10xxxxxx 10xxxxxx 10xxxxxx			21 bits (RFC limit)
	! 0020,0000-03FF,FFFF	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx		26 bits (invalid)
	! 0400,0000-7FFF,FFFF	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx	31 bits (invalid)
	!==================================================================================================
	option type=explicit							!
	!
	declare string	dst$, tst$, alt$					!
	declare long	uni%, tst%, alt%, i%, j%, bytes%			!
	!-----------------------------------------------------------------------
	!	main
	!-----------------------------------------------------------------------
	for i% = 1 to len(src$)							! scan the string
	    tst$ = mid$(src$, i%, 1)						! isolate tst character
	    tst% = asc(tst$)							! convert to ascii
	    if tst% <= 127 then							!
		dst$ = dst$ + tst$						! copy ASCII as-is
		goto get_next_char						! next iteration
	    end if								!
	    !
	    if (tst% and X"e0") = x"c0" then					! test for: 1100-xxxx
		bytes%	= 2							! this might be a 2-byte sequence (or not)
		uni%	= tst% and x"1f"					! keep 5-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    if (tst% and X"f0") = x"e0" then					! test for: 1110-xxxx
		bytes%	= 3							! this might be a 3-byte sequence (or not)
		uni%	= tst% and x"0f"					! keep 4-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    if (tst% and X"f8") = x"f0" then					! test for: 1111-xxxx
		bytes%	= 4							! this might be a 4-byte sequence (or not)
		uni%	= tst% and x"07"					! keep 3-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    !
	    !	definately not unicode
	    !
	    dst$ = dst$ + tst$							! copy cp1252 as-is
	    goto get_next_char							! next iteration
	    !
	    !	this might be a unicode character depending upon the following bytes following
	    !	entry:	i%	= points to tmp$ (first utf-8 octet)
	    !		bytes%	= expected total number of octects (2-4)
	    !
	    process_uni:							!
	    for j% = 1 to (bytes%-1)						!
		alt$ = mid$(src$, i%+j%, 1)					! isolate character after tst$
		alt% = asc(alt$)						!
		if (alt% and x"c0") = x"80"					! is this a secondary utf-8 octet? (10xx-xxxx)
		then								! yes
		    alt% = (alt% and x"7f")					! isolate 6-bits
		    uni% = uni% * 64%						! shift by 6 places
		    uni% = uni% + alt%						! merge bits
		else								! no
		    dst$ = dst$ + tst$						! assume is cp1252 then copy as-is
		    goto get_next_char						!
		end if								!
	    next j%								!
	    !
	    !	at this point, this appears to be a legal UTF-8 sequence
	    !
	    !-------------------------------------------------------------------
	    !	unicode mapping to cp1252
	    !	note: perhaps a single table lookup would be more efficient
	    !-------------------------------------------------------------------
	    !	unicode ref: http://unicode-table.com/en/
	    !
	    select uni%							!
		case <= 255						!
				dst$ = dst$ + chr$(uni%)		!
		case 8208, 8209						! x2010,x2011	Unicode: hyphen, non-breaking hyphen
				dst$ = dst$ + chr$(45)			!		ASCII  : dash
		case 8210, 8211						! x2012,x2013	Unicode: Figure Dash, En Dash
				dst$ = dst$ + chr$(150)			!		Windows: dash
		case 8212, 8213						! x2014,x2015	Unicode: EM dash, Horizontal Bar
				dst$ = dst$ + chr$(151)			!		Windows: dash
		case 8216						! x2018		Unicode: left side single quote
				dst$ = dst$ + chr$(145)			!		Windows: left single quote
		case 8217						! x2019		Unicode: right side single quote
				dst$ = dst$ + chr$(146)			!		Windows: right single quote
		case 8220						! x201c		Unicode: left double-quote
				dst$ = dst$ + chr$(147)			!		Windows: left double-quote
		case 8221						! x201d		Unicode: right double-quote
				dst$ = dst$ + chr$(148)			!		Windows" right double-quote
		case 8224						! x2020		Unicode: dagger
				dst$ = dst$ + chr$(134)			!		Windows: dagger
		case 8225						! x2021		Unicode: double dagger
				dst$ = dst$ + chr$(135)			!		Windows: double dagger
		case 8226						! x2022		Unicode: black dot
				dst$ = dst$ + chr$(149)			!		Windows: black dot
		case 8230						! x2026		Unicode: three dot horizontal ellipsis
				dst$ = dst$ + chr$(133)			!		Windows: horizontal ellipsis
		case 8248						! x2038		Unicode: caret
				dst$ = dst$ + chr$(94)			!		Windows: circumflex
		case 8249						! x2039		Unicode: single less than
				dst$ = dst$ + chr$(139)			!		Windows: single less than
		case 8250						! x203a		unicode: single greater than
				dst$ = dst$ + chr$(155)			!		Windows: single greater than
		case 8364						! x20ac		Unicode: Euro symbol
				dst$ = dst$ + chr$(128)			!		Windows: Euro symbol
		case else						! oops, what do we do now?
%if 1=1 %then								!
				dst$ = dst$ + chr$(182)			! substitute	Windows: Pilcrow (funny inverted "P")
%else									!
									! do nothing (throw it way)
%end %if								!
	    end select							!
	    !-------------------------------------------------------------------
	    i% = i% + bytes% - 1						! eat some chars (NEXT will eat one more)
	    get_next_char:							!
	next i%									! advance by tst
	unicode_to_iso = dst$							! pass back to called
	end function								! adios
	!
32120	function long mixed_to_unicode(string inbound$, long uni%())		!
	!==================================================================================================
	! title  : mixed_to_unicode.fun
	! purpose: Scan inbound data looking for legal UTF-8 code sequences. These are converted to unicode
	!	   which is then mapped to cp1252 (also known as Windows-1252; also known as ANSI) which is
	!	   a superset of ISO-8859-1
	! caveat2: There are two ways to do this: Strict and Relaxed
	!	   1) Strict : everything above ASCII 127 must be legal UTF-8 or we throw it away
	!	   2) Relaxed: anything above ASCII 127 which is not legal UTF-8 is assumed to be cp1252 so
	!		must be mapped to unicode or thrown away
	! history:
	! ver who when   what
	! --- --- ------ ----------------------------------------------------------------------------------
	! 100 NSR 170315 1. original effort (derived from the misnamed function: UNICODE_TO_ISO_106.FUN)
	!==================================================================================================
	! UTF-8 encoding
	! 1. RFC-2279: http://www.faqs.org/rfcs/rfc2279.html
	! 2. RFC-3629: https://tools.ietf.org/html/rfc3629 (limits UTF-8 to 4 octets; some code points
	!	in the 21-bit address space are being used (notice the 'z' on line 4))
	!
	! UCS-4 range (hex)	UTF-8 octet sequence (binary)				Data Bits
	! -------------------	-----------------------------				---------
	! 0000,0000-0000,007F	0xxxxxxx						 7 bits
	! 0000,0080-0000,07FF	110xxxxx 10xxxxxx					11 bits
	! 0000,0800-0000,FFFF	1110xxxx 10xxxxxx 10xxxxxx				16 bits
	! 0001,0000-001F,FFFF	11110zXX 10xxxxxx 10xxxxxx 10xxxxxx			21 bits (RFC limit)
	! 0020,0000-03FF,FFFF	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx		26 bits (invalid)
	! 0400,0000-7FFF,FFFF	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx	31 bits (invalid)
	!==================================================================================================
	option type=explicit							!
	!
	declare string	tst$, alt$, src$					!
	declare long	uni%, tst%, alt%, i%, j%, k%, bytes%, count_out%	!
	!-----------------------------------------------------------------------
	!	main
	!-----------------------------------------------------------------------
	k% = len(inbound$)							! measure the length of inbound
	src$ = inbound$ + space$(6)						! tack on 6-spaces for end-of-string processing
	for i% = 1 to k%							! scan the string
	    tst$ = mid$(src$, i%, 1)						! isolate tst character
	    tst% = asc(tst$)							! convert to ascii
	    if tst% <= 127 then							!
		count_out% = count_out% + 1					!
		uni%(count_out%) = tst%						! store this value
		goto get_next_char						! next iteration
	    end if								!
	    !
	    if (tst% and X"e0") = x"c0" then					! test for: 110x-xxxx
		bytes%	= 2							! this might be a 2-byte sequence (or not)
		uni%	= tst% and x"1f"					! keep 5-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    if (tst% and X"f0") = x"e0" then					! test for: 1110-xxxx
		bytes%	= 3							! this might be a 3-byte sequence (or not)
		uni%	= tst% and x"0f"					! keep 4-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    if (tst% and X"f8") = x"f0" then					! test for: 1111-0xxx
		bytes%	= 4							! this might be a 4-byte sequence (or not)
		uni%	= tst% and x"07"					! keep 3-bits of octet #1
		goto process_uni						! continue below
	    end if								!
	    !
	    !	definately not unicode
	    !
	    select tst%								! test the original code
		case 128
			uni% = x'20ac'
		case 129
			uni% = 0
		case 130
			uni% = x'201a'
		case 131
			uni% = x'0192'
		case 132
			uni% = x'201e'
		case 133
			uni% = x'2026'
		case 134
			uni% = x'2020'
		case 135
			uni% = x'2021'
		case 136
			uni% = x'02c6'
		case 137
			uni% = x'2030'
		case 138
			uni% = x'0160'
		case 139
			uni% = x'2039'
		case 140
			uni% = x'0152'
		case 141
			uni% = 0
		case 142
			uni% = x'017d'
		case 143
			uni% = 0
		case 144
			uni% = 0
		case 145
			uni% = x'2018'
		case 146
			uni% = x'2019'
		case 147
			uni% = x'201c'
		case 148
			uni% = x'201d'
		case 149
			uni% = x'2022'
		case 150
			uni% = x'2013'
		case 151
			uni% = x'2014'
		case 152
			uni% = x'02dc'
		case 153
			uni% = x'2122'
		case 154
			uni% = x'0161'
		case 155
			uni% = x'203a'
		case 156
			uni% = x'0153'
		case 157
			uni% = 0
		case 158
			uni% = x'017e'
		case 159
			uni% = x'0178'
		case else
			uni% = tst%
	    end select								!
	    if uni% > 0 then							!
		count_out% = count_out% + 1					!
		uni%(count_out%) = uni%						! store this value
	    end if								!
	    goto get_next_char							! next iteration
	    !
	    !	might be unicode depending upon the following bytes
	    !	entry:	i%	= points to tmp$ (first utf-8 octet)
	    !		bytes%	= expected total number of octects (2-4)
	    !
	    process_uni:							!
	    for j% = 1 to (bytes%-1)						!
		alt$ = mid$(src$, i%+j%, 1)					! isolate character after tst$
		alt% = asc(alt$)						!
		if (alt% and x"c0") = x"80"					! is this a secondary utf-8 octet? (10xx-xxxx)
		then								! yes
		    alt% = (alt% and x"3f")					! isolate 6-bits
		    uni% = uni% * 64%						! shift this by 6 places
		    uni% = uni% + alt%						! merge bits
		else								! no
		    count_out% = count_out% + 1					!
		    uni%(count_out%) = tst%					! store original first byte as-is
		    goto get_next_char						!
		end if								!
	    next j%								!
	    !
	    count_out% = count_out% + 1						!
	    uni%(count_out%) = uni%						! is legal UTF-8 so store unicode
	    !-------------------------------------------------------------------
	    i% = i% + bytes% - 1						! eat some chars (NEXT will eat one more)
	    get_next_char:							!
	next i%									! advance by tst
	uni%(0) = count_out%							!
	mixed_to_unicode = count_out%						! pass back to called
	end function								! adios
	!
32130	function string unicode_to_utf8(long uni%())				!
	!==================================================================================================
	! title  : unicode_to_utf8.fun
	! history:
	! ver who when   what
	! --- --- ------ ----------------------------------------------------------------------------------
	! 100 NSR 170315 1. original effort
	!==================================================================================================
	! UTF-8 encoding
	! 1. RFC-2279: http://www.faqs.org/rfcs/rfc2279.html
	! 2. RFC-3629: https://tools.ietf.org/html/rfc3629 (limits UTF-8 to 4 octets; some code points in
	!			the 21-bit address space are not being used (notice the 'z' on line 4))
	!
	! UCS-4 range (hex)	UTF-8 octet sequence (binary)				Data Bits
	! -------------------	-----------------------------				---------
	! 0000,0000-0000,007F	0xxxxxxx						 7 bits
	! 0000,0080-0000,07FF	110xxxxx 10xxxxxx					11 bits
	! 0000,0800-0000,FFFF	1110xxxx 10xxxxxx 10xxxxxx				16 bits
	! 0001,0000-001F,FFFF	11110zXX 10xxxxxx 10xxxxxx 10xxxxxx			21 bits (RFC limit)
	! 0020,0000-03FF,FFFF	111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx		26 bits (invalid)
	! 0400,0000-7FFF,FFFF	1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx	31 bits (invalid)
	!==================================================================================================
	option type=explicit							!
	!
	declare string	out$, temp$						!
	declare long	uni%, temp%, alt%, i%, j%, k%, bytes%, bits%		!
	!-----------------------------------------------------------------------
	!	main
	!-----------------------------------------------------------------------
	out$ = ""								!
	k% = uni%(0)								! data length is stored here
	for i% = 1 to k%							! scan the string
	    uni% = uni%(i%)							! grab some unicode
	    select uni%								!
		case 0		to x"0007F"					!
			bytes% = 1						!
		case x"00080"	to x"007FF"					!
			bytes% = 2						!
		case x"00800"	to x"0FFFF"					!
			bytes% = 3						!
		case x"10000"	to x"10FFFF"					!
			bytes% = 4						!
		case else							!
		     goto next_code_point					! throw away anything else
	    end select								!
	    !
	    temp$ = ""								! zap
	    temp% = bytes%							! copy desired bytes
	    while temp% > 0							!
		if temp% = 1 then						! if on last one
		    select bytes%						!
			case 1							!
			    bits% = uni%					! no encoding required
			case 2							!
			    bits% = b"11000000" or uni%				!
			case 3							!
			    bits% = b"11100000" or uni%				!
			case 4							!
			    bits% = b"11110000" or uni%				!
		    end select							!
		else								! else not on last one
		    bits% = b"10000000" or (uni% and x"3f")			! only use the lowest 6-bits
		    uni% = uni% / 64						! shift data by six bits
		end if								!
		temp$ = chr$(bits%) + temp$					! append from the left
		temp% = temp% -1						!
	    next
	    out$ = out$ + temp$
	    !
	    next_code_point:
	next i%									!
	unicode_to_utf8 = out$							! pass string back
	end function								! adios
	!
	!33333333333333333333333333333333333333333333333333333333333333333333333
	! end of template 3/3
	!