OpenVMS Source-Code Demos

INTERNATIONALIZATION_DEMO_101

//==================================================================================================
// title  : internationalization_demo_101.c
// author : Neil Rieck
// created: 2016-03-09
// purpose: a playground to trial a few ideas
// notes  : 1)"I18N" means "InternationalizatioN" (18 chars between I and N of "InternationalizatioN")
//	    2) "L10N" means "LocalizatioN" (10 chars between L and N of "LocalizatioN")
//	    3) version 100 should run properly on any OpenVMS/VMS system
//	    4) version 101 will only run properly if optional kit "VMSI18N" is installed
//
// ver who when     what
// --- --- -------- --------------------------------------------------------------------------------
// 100 NSR 20160310 1. original effort
//     NSR 20160311 2. the saga continues
// 101 NSR 20160311 1. changes to demo conversions to UTF-8
//==================================================================================================
#include <stdio.h>
#include <stdlib.h>
#include <locale.h>
#include <string.h>
#include <wchar.h>
#include <iconv.h>
#include <errno.h>
//
char		*locale;
char		fromcodeset[30];
char		tocodeset[30];
int		iconv_opened;
iconv_t		iconv_struct;
char		buffer0[32767];
char		buffer1[32767];
int		iconv_status;
unsigned int	buf0len;
unsigned int	buf1len;
char		*buf0ptr;
char		*buf1ptr;
//
//	main()
//
int main(int argc, char **argv){
    printf("-i-Program: %s\n",argv[0]);
    //===========================================================================
    //	fun with locale
    //===========================================================================
    // 1. folder SYS$I18N_LOCALE will usually contain 2 files
    // 2. installing optional kit "VMSI18N" will increase this number to 97
    //---------------------------------------------------------------------------
    locale = setlocale(LC_ALL,"");						// probe our current setting
    if (locale == NULL) {
	printf("-e-could not determine the locale\n");
    }else{
	printf("-i-current locale: %s\n", locale);				// usually will display "C"
    }
    //===========================================================================
    //	fun with iconv (international conversion)
    //===========================================================================
    // 1. This routine will look for, and load, this file:
    //		SYS$I18N_ICONV:ISO8859-1-EURO_UTF-8.ICONV
    //					      +++++- to
    //					     +------ separator
    //			       ++++++++++++++------- from
    // 2. Folder SYS$I18N_ICONV will usually contain 10 files (none related to UTF-8)
    // 3. Installing optional kit "VMSI18N" will increase this number to 173 (46 related to UTF-8)
    // 4. These character sets (from 2010) are now quite old as far as the internet is concerned. For example,
    //    in the real world we see Windows-1252 (also known as ANSI) supporting 32 characters not supported in
    //    ISO-8859-1 (the first char being "Euro" at position 0x80). The HTML5 specification specifies that all
    //    browsers expecting ISO-8859-1 must be prepared to handle windows-1252/ANSI so this hack will determine
    //    if file "ISO8859-1-EURO_UTF-8.ICONV" properly handles windows-1252ANSI symbols
    //---------------------------------------------------------------------------
    sprintf(fromcodeset,"ISO8859-1-EURO");					// one byte characters
    sprintf(tocodeset  ,"UTF-8");						// variable byte characters
    iconv_opened = FALSE;
    if ((iconv_struct = iconv_open(tocodeset,fromcodeset)) == (iconv_t) - 1){
	switch(errno){
	case EMFILE:
	case ENFILE:
	    printf("-e-too many conversion files to open\n");
	    break;
	case ENOMEM:
	    printf("-e-not enough memory\n");
	    break;
	case EINVAL:
	    printf("-e-unsupported Conversion\n");
	    break;
	default:
	    printf("-e-unexpected error (%ld) from iconv_open()\n",errno);
	}
    }else{
	iconv_opened = TRUE;
	printf("-i-iconv_open was sucessful\n");
    }
    //
    //	convert a string or two
    //
    if (iconv_opened) {
	sprintf(buffer0,"Test with copyright %c and Euro %c and TM %c",0xa9, 0x80, 0x99);
	//
	// warnings:
	// 1) "ISO8859-1-EURO_UTF-8.ICONV" is not working properly (or at least not like windows-1252/ansi as I had hoped)
	//
	//	char	windows-1250	unicode		expected utf-8	resultant utf-8
	//		------------	-------		--------------	---------------
	//	Euro	0x80		0x20ac		e2 82 ac 	c2 80 (wrong)
	//	TM	0x99		0x2122		e2 84 a2	c2 99 (wrong)
	//
	//	Why is this wrong. UTF-8 is a unicode encoding rather than a character set (although rfc3629 restricts
	//	unicode to 23 bits with one noticable hole; "unicode 8" only supports 120,737 code points)
	//
	// 2) so we could send this output hoping for the best, or...
	// 3) use iconv() to convert ISO8859-1-EURO to UCS-2, then
	//    write some code to remap 27 of 32 characters to unicode, then
	//    use iconv() to convert to UCS-2 to UTF-8
	//
	buf0len = strlen(buffer0);					// need the size of the data to convert
	buf1len = sizeof(buffer1);					// need the size of the data buffer available
	buf0ptr = (char*) &buffer0;					// iconv requires a pointer to a pointer
	buf1ptr = (char*) &buffer1;					//	ditto
	unsigned long buf0len_b4, buf1len_b4;				// hacking (because enquiring minds want to know)
	buf0len_b4	= buf0len;					// hacking; copy data for hacking purposes
	buf1len_b4	= buf1len;					// 	ditto
	//
	// size_t iconv (iconv_t cd, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft);
	//
	iconv_status = iconv (iconv_struct, &buf0ptr, &buf0len, &buf1ptr, &buf1len);
	printf("-i-iconv_status: %ld\n", iconv_status);			//
	printf("-i-buf0len     : %ld -> %ld\n", buf0len_b4, buf0len);	//
	printf("-i-buf1len     : %ld -> %ld\n", buf1len_b4, buf1len);	//
	for (int i=0; i<(buf1len_b4-buf1len); i++){			// peek at our data buffer
	    printf("-i-position: %5x data: %2x\n",i, (unsigned char) buffer1[i]);
	}
    }
    //
    //	time to exit
    //	but do we need to do all this? Perhaps just a blind call to iconv_close(iconv_struct)
    //
    if (iconv_opened) {
	if (iconv_close(iconv_struct) == -1) {
	    switch(errno){
	    case EBADF:
		printf("-e-conversion descriptor is invalid\n");
		break;
	    default:
		printf("-e-unexpected error (%ld) from iconv_close()\n",errno);
		break;
	    }
	}
    }
    //---------------------------------------------------------------------------
    printf("-i-exit\n");
    return 1;
}