This repository has been archived on 2024-04-08. You can view files and clone it, but cannot push or open issues or pull requests.
deb-mbse/lib/charconv_utf.c
2001-08-17 05:46:24 +00:00

268 lines
10 KiB
C

/*****************************************************************************
*
* File ..................: charconv_utf.c
* Purpose ...............: Common utilities
* Last modification date : 29-Aug-2000
*
*****************************************************************************
* Copyright (C) 1997-2000
*
* Michiel Broek FIDO: 2:280/2802
* Beekmansbos 10
* 1971 BV IJmuiden
* the Netherlands
*
* This file is part of MBSE BBS.
*
* This BBS is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2, or (at your option) any
* later version.
*
* MBSE BBS is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with MBSE BBS; see the file COPYING. If not, write to the Free
* Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*****************************************************************************/
#include "libs.h"
#include "structs.h"
#include "common.h"
/* The 64 base64 digits; the position of a digit in this string is its value. */
char Base_64Code[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789+/";
/*
 * Reverse lookup for Base_64Code[]: indexed by a 7 bit character code it
 * gives the value (0..63) of that base64 digit, or -1 when the character
 * is not a base64 digit.
 */
static int index_hex2[128] = {
    /* 0x00 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
    /* 0x30 */ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
    /* 0x40 */ -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
    /* 0x50 */ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
    /* 0x60 */ -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    /* 0x70 */ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1
};
/*
 * Decode a UTF-7 string to an 8 bit string and pass it on to the charset
 * conversion routines.
 *
 *  in    NUL terminated UTF-7 text, it is only read.
 *  out   handed unchanged to noconv() / eight2eight().
 *  code  wanted output charset. When it is CHRS_AUTODETECT or CHRS_NOTSET
 *        on entry it is replaced by getoutcode() of the charset that was
 *        detected in the text.
 *
 * Text outside a '+' ... '-' section is copied as is. Inside a section the
 * base64 digits are decoded to 16 bit units (UTF-16, high byte first, see
 * RFC 2152). The high byte of a unit selects the charset and the value
 * that is added to the low byte to get the iso-8859-* character.
 *
 * When no memory can be allocated the input is passed to noconv() and
 * *code is left alone.
 */
void utf7_to_eight(char *in,char **out,int *code)
{
    int     l_code = CHRS_AUTODETECT;
    int     isb64 = 0;
    char    *p, *q, *buf;

    /*
     * The decoded text is never longer than the input: plain characters
     * are copied one to one, n base64 digits give at most n*6/16
     * characters and "+-" gives one. One extra byte is needed for the
     * terminating '\0' that is written after the loop.
     */
    buf = malloc(strlen(in) + 1);
    if (buf == NULL) {
        /* out of memory, fall back to the unconverted input */
        noconv(in,out);
        return;
    }

    for (p = in, q = buf; *p != '\0';) {
        if (isb64) { /* we are in B64 encoding, that is in utf-7 */
            unsigned int    bits = 0;   /* decoded bits not yet consumed */
            int             nbits = 0;  /* number of valid bits in bits  */
            size_t          i, l;

            /* find the length of the B64 string */
            l = strspn(p,Base_64Code);

            /* "+-" is the utf-7 notation for a literal '+' */
            if ((l == 0) && (*p == '-'))
                *q++ = '+';

            for (i = 0; i < l; i++) {
                /* strspn() made sure that *p is a Base_64Code digit, so
                 * the index is below 128 and the table value is not -1
                 */
                bits = (bits << 6) | (unsigned int)index_hex2[(unsigned char)*p++];
                nbits += 6;

                if (nbits >= 16) {
                    unsigned int    hi, lo, offset;

                    /* take one complete 16 bit unit from the top */
                    nbits -= 16;
                    hi = (bits >> (nbits + 8)) & 0xff;
                    lo = (bits >> nbits) & 0xff;
                    /* keep only the pending bits, so that bits never
                     * grows beyond 21 bits
                     */
                    bits &= (1U << nbits) - 1;

                    /* if the charset code is unknown try to find it.
                     * it only works for latin1 (iso-8859-1), cyrillic, greek,
                     * arabic and hebrew (iso-8859-[5678]), as for other latin
                     * encodings it is harder, iso-8859-2 is assumed as it is
                     * the most common
                     */
                    if ((l_code == CHRS_AUTODETECT) || (l_code == CHRS_ISO_8859_1)) {
                        switch (hi) {
                            case 0x00 : l_code = CHRS_ISO_8859_1; break;
                            case 0x01 : l_code = CHRS_ISO_8859_2; break;
                            case 0x03 : l_code = CHRS_ISO_8859_7; break;
                            case 0x04 : l_code = CHRS_ISO_8859_5; break;
                            case 0x05 : l_code = CHRS_ISO_8859_8; break;
                            case 0x06 : l_code = CHRS_ISO_8859_6; break;
                            default   : break;
                        }
                    }

                    /* what to add to the low byte to convert to iso-8859-*
                     * note that it doesn't work for iso-8859-{2,3,4,9,10}
                     * as the offset changes for almost each char
                     */
                    switch (hi) {
                        case 0x03 : offset = 0x30; break;
                        case 0x04 : offset = 0xa0; break;
                        case 0x05 : offset = 0x10; break;
                        case 0x06 : offset = 0xa0; break;
                        default   : offset = 0x00; break;
                    }

                    /* convert to the right 8bit char by adding offset */
                    *q++ = (char)((lo + offset) & 0xff);
                }
            }

            /* end of B64 encoding, bits of an incomplete unit are dropped */
            if (*p == '-')
                p++;
            isb64 = 0;
        } else if (*p == '+') { /* '+' is the beginning of a new B64 section */
            isb64 = 1;
            p++;
        } else { /* ascii encoding */
            *q++ = *p++;
        }
    }
    *q = '\0';

    /* now we know the 8bit charset that was encoded with utf-7,
     * so ask again to see if a conversion to FTN charset is needed
     */
    if (*code == CHRS_AUTODETECT || *code == CHRS_NOTSET)
        *code = getoutcode(l_code);

    /* NOTE(review): it is not visible here whether noconv() and
     * eight2eight() copy buf or keep the pointer, so buf is only released
     * on the path where it is not handed on - confirm the ownership.
     */
    switch (l_code) {
        case CHRS_ISO_8859_1 :
        case CHRS_ISO_8859_15:
            switch (*code) {
                case CHRS_CP437 : eight2eight(buf,out,(char *)ISO_8859_1__CP437); break;
                case CHRS_CP850 : eight2eight(buf,out,(char *)ISO_8859_1__CP850); break;
                case CHRS_MACINTOSH : eight2eight(buf,out,(char *)ISO_8859_1__MACINTOSH); break;
                default : noconv(buf,out); break;
            }
            break;
        case CHRS_ISO_8859_5 :
            switch (*code) {
                case CHRS_CP866 : eight2eight(buf,out,(char *)ISO_8859_5__CP866); break;
                case CHRS_KOI8_R :
                case CHRS_KOI8_U : eight2eight(buf,out,(char *)ISO_8859_5__KOI8); break;
                case CHRS_MIK_CYR : eight2eight(buf,out,(char *)ISO_8859_5__MIK_CYR); break;
                default : noconv(buf,out); break;
            }
            break;
        case CHRS_ISO_8859_8 :
            switch (*code) {
                case CHRS_CP424 : eight2eight(buf,out,(char *)ISO_8859_8__CP424); break;
                case CHRS_CP862 : eight2eight(buf,out,(char *)ISO_8859_8__CP862); break;
                default : noconv(buf,out); break;
            }
            break;
        default :
            /* the undecoded input is used, the decoded copy is not needed */
            noconv(in,out);
            free(buf);
            break;
    }
}
/*
 *  UNICODE            UTF-8
 *  -------------      ------------------------------------
 *  0000 -> 007F   =  7 bits = 0xxxxxxx
 *  0080 -> 07FF   = 11 bits = 110xxxxx 10xxxxxx
 *  0800 -> FFFF   = 16 bits = 1110xxxx 10xxxxxx 10xxxxxx
 *
 * Decode a UTF-8 string to an 8 bit string and pass it on to the charset
 * conversion routines.
 *
 *  in    NUL terminated UTF-8 text, it is only read.
 *  out   handed unchanged to noconv() / eight2eight().
 *  code  wanted output charset. When it is CHRS_AUTODETECT or CHRS_NOTSET
 *        on entry it is replaced by getoutcode() of the charset that was
 *        detected in the text.
 *
 * Only complete two and three byte sequences from the table above are
 * decoded. Every other byte (ascii, a lead byte without its continuation
 * bytes, a stray continuation byte, a lead byte of a longer sequence) is
 * copied unchanged. The high byte of a decoded character selects the
 * charset and the value that is added to the low byte to get the
 * iso-8859-* character.
 *
 * When no memory can be allocated the input is passed to noconv() and
 * *code is left alone.
 */
void utf8_to_eight(char *in,char **out,int *code)
{
    int     l_code = CHRS_AUTODETECT;
    char    *p, *q, *buf;

    /*
     * Every decoded sequence and every copied byte gives one output byte,
     * so the result is never longer than the input. One extra byte is
     * needed for the terminating '\0' that is written after the loop.
     */
    buf = malloc(strlen(in) + 1);
    if (buf == NULL) {
        /* out of memory, fall back to the unconverted input */
        noconv(in,out);
        return;
    }

    for (p = in, q = buf; *p != '\0';) {
        /* look at the bytes as unsigned, char may be a signed type */
        const unsigned char *s = (const unsigned char *)p;
        unsigned int        ucs, hi, lo, offset;

        /* The continuation bytes are tested from left to right and the
         * terminating '\0' is not a continuation byte, so a truncated
         * sequence never makes us read behind the end of the string.
         */
        if (((s[0] & 0xf0) == 0xe0) && ((s[1] & 0xc0) == 0x80) && ((s[2] & 0xc0) == 0x80)) {
            /* 16 bits = 1110xxxx 10xxxxxx 10xxxxxx */
            ucs = ((unsigned int)(s[0] & 0x0f) << 12) |
                  ((unsigned int)(s[1] & 0x3f) << 6) |
                   (unsigned int)(s[2] & 0x3f);
            p += 3;
        } else if (((s[0] & 0xe0) == 0xc0) && ((s[1] & 0xc0) == 0x80)) {
            /* 11 bits = 110xxxxx 10xxxxxx */
            ucs = ((unsigned int)(s[0] & 0x1f) << 6) |
                   (unsigned int)(s[1] & 0x3f);
            p += 2;
        } else {
            /* 7 bits = 0xxxxxxx, or a byte we can not decode */
            *q++ = *p++;
            continue;
        }

        hi = (ucs >> 8) & 0xff;
        lo = ucs & 0xff;

        /* if the charset code is unknown try to find it.
         * it only works for latin1 (iso-8859-1), cyrillic, greek,
         * arabic and hebrew (iso-8859-[5678]), as for other latin
         * encodings it is harder, iso-8859-2 is assumed as it is
         * the most common
         */
        if ((l_code == CHRS_AUTODETECT) || (l_code == CHRS_ISO_8859_1)) {
            switch (hi) {
                case 0x00 : l_code = CHRS_ISO_8859_1; break;
                case 0x01 : l_code = CHRS_ISO_8859_2; break;
                case 0x03 : l_code = CHRS_ISO_8859_7; break;
                case 0x04 : l_code = CHRS_ISO_8859_5; break;
                case 0x05 : l_code = CHRS_ISO_8859_8; break;
                case 0x06 : l_code = CHRS_ISO_8859_6; break;
                default   : break;
            }
        }

        /* what to add to the low byte to convert to iso-8859-*
         * note that it doesn't work for iso-8859-{2,3,4,9,10}
         * as the offset changes for almost each char
         */
        switch (hi) {
            case 0x03 : offset = 0x30; break;
            case 0x04 : offset = 0xa0; break;
            case 0x05 : offset = 0x10; break;
            case 0x06 : offset = 0xa0; break;
            default   : offset = 0x00; break;
        }

        /* convert to the right 8bit char by adding offset */
        *q++ = (char)((lo + offset) & 0xff);
    }
    *q = '\0';

    /* now we know the 8bit charset that was encoded with utf-8,
     * so ask again to see if a conversion to FTN charset is needed
     */
    if (*code == CHRS_AUTODETECT || *code == CHRS_NOTSET)
        *code = getoutcode(l_code);

    /* NOTE(review): it is not visible here whether noconv() and
     * eight2eight() copy buf or keep the pointer, so buf is only released
     * on the path where it is not handed on - confirm the ownership.
     */
    switch (l_code) {
        case CHRS_ISO_8859_1 :
        case CHRS_ISO_8859_15:
            switch (*code) {
                case CHRS_CP437 : eight2eight(buf,out,(char *)ISO_8859_1__CP437); break;
                case CHRS_CP850 : eight2eight(buf,out,(char *)ISO_8859_1__CP850); break;
                case CHRS_MACINTOSH : eight2eight(buf,out,(char *)ISO_8859_1__MACINTOSH); break;
                default : noconv(buf,out); break;
            }
            break;
        case CHRS_ISO_8859_5 :
            switch (*code) {
                case CHRS_CP866 : eight2eight(buf,out,(char *)ISO_8859_5__CP866); break;
                case CHRS_KOI8_R :
                case CHRS_KOI8_U : eight2eight(buf,out,(char *)ISO_8859_5__KOI8); break;
                case CHRS_MIK_CYR : eight2eight(buf,out,(char *)ISO_8859_5__MIK_CYR); break;
                default : noconv(buf,out); break;
            }
            break;
        case CHRS_ISO_8859_8 :
            switch (*code) {
                case CHRS_CP424 : eight2eight(buf,out,(char *)ISO_8859_8__CP424); break;
                case CHRS_CP862 : eight2eight(buf,out,(char *)ISO_8859_8__CP862); break;
                default : noconv(buf,out); break;
            }
            break;
        default :
            /* the undecoded input is used, the decoded copy is not needed */
            noconv(in,out);
            free(buf);
            break;
    }
}