268 lines
10 KiB
C
268 lines
10 KiB
C
/*****************************************************************************
|
|
*
|
|
* File ..................: charconv_utf.c
|
|
* Purpose ...............: Common utilities
|
|
* Last modification date : 29-Aug-2000
|
|
*
|
|
*****************************************************************************
|
|
* Copyright (C) 1997-2000
|
|
*
|
|
* Michiel Broek FIDO: 2:280/2802
|
|
* Beekmansbos 10
|
|
* 1971 BV IJmuiden
|
|
* the Netherlands
|
|
*
|
|
* This file is part of MBSE BBS.
|
|
*
|
|
* This BBS is free software; you can redistribute it and/or modify it
|
|
* under the terms of the GNU General Public License as published by the
|
|
* Free Software Foundation; either version 2, or (at your option) any
|
|
* later version.
|
|
*
|
|
* MBSE BBS is distributed in the hope that it will be useful, but
|
|
* WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with MBSE BBS; see the file COPYING. If not, write to the Free
|
|
* Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
*****************************************************************************/
|
|
|
|
#include "libs.h"
|
|
#include "structs.h"
|
|
#include "common.h"
|
|
|
|
|
|
char Base_64Code[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
|
|
|
|
|
|
/* returns numeric value from a Base64Code[] digit */
|
|
static int index_hex2[128] = {
|
|
-1, -1, -1, -1, -1, -1, -1, -1,
|
|
-1, -1, -1, -1, -1, -1, -1, -1,
|
|
-1, -1, -1, -1, -1, -1, -1, -1,
|
|
-1, -1, -1, -1, -1, -1, -1, -1,
|
|
-1, -1, -1, -1, -1, -1, -1, -1,
|
|
-1, -1, -1,0x3e, -1, -1, -1,0x3f,
|
|
0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,
|
|
0x3c,0x3d, -1, -1, -1, -1, -1, -1,
|
|
-1,0x00,0x01,0x02,0x03,0x04,0x05,0x06,
|
|
0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,
|
|
0x0f,0x10,0x11,0x12,0x13,0x14,0x15,0x16,
|
|
0x17,0x18,0x19, -1, -1, -1, -1, -1,
|
|
-1,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,0x20,
|
|
0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,
|
|
0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,0x30,
|
|
0x31,0x32,0x33, -1, -1, -1, -1, -1
|
|
};
|
|
|
|
|
|
|
|
void utf7_to_eight(char *in,char **out,int *code)
|
|
{
|
|
int isb64,l_code=CHRS_AUTODETECT;
|
|
char *p, *q, *buf;
|
|
|
|
buf=malloc(strlen(in)*sizeof(char));
|
|
|
|
isb64=0;
|
|
for (p = in, q = buf; *p != '\0';) {
|
|
|
|
if (isb64) { /* we are in B64 encoding, that is in utf-7 */
|
|
int bit_buffer=0;
|
|
int nbits=0;
|
|
int i,l,result,offset=0;
|
|
|
|
/* find the lenght of the B64 string */
|
|
l=strspn(p,Base_64Code);
|
|
for (i=0;i<l;i++) {
|
|
bit_buffer <<= 6;
|
|
bit_buffer |= index_hex2[(unsigned int)*p++];
|
|
nbits += 6;
|
|
if (nbits >= 8) {
|
|
nbits -= 8;
|
|
result = ((bit_buffer >> nbits)&0xff);
|
|
/* if the charset code is unknown try to find it.
|
|
* it only works for latin1 (iso-8859-1), cyrillic, greek,
|
|
* arabic and hebrew (iso-8859-[5678]), as for other latin
|
|
* encodings it is harder, iso-8859-2 is assumed as it is
|
|
* the most common
|
|
*/
|
|
if ((l_code==CHRS_AUTODETECT) || (l_code==CHRS_ISO_8859_1)) {
|
|
if (result == 0x00) l_code=CHRS_ISO_8859_1;
|
|
else if (result == 0x01) l_code=CHRS_ISO_8859_2;
|
|
else if (result == 0x03) l_code=CHRS_ISO_8859_7;
|
|
else if (result == 0x04) l_code=CHRS_ISO_8859_5;
|
|
else if (result == 0x05) l_code=CHRS_ISO_8859_8;
|
|
else if (result == 0x06) l_code=CHRS_ISO_8859_6;
|
|
}
|
|
/* what to add to next byte to convert to iso-8859-*
|
|
* note that it doesn't work for iso-8859-{2,3,4,9,10}
|
|
* as the offset changes for almost each char
|
|
*/
|
|
if (result == 0x00) offset=0x00;
|
|
else if (result == 0x03) offset=0x30;
|
|
else if (result == 0x04) offset=0xa0;
|
|
else if (result == 0x05) offset=0x10;
|
|
else if (result == 0x06) offset=0xa0;
|
|
|
|
/* convert to the right 8bit char by adding offset */
|
|
if (result < 0x06) *q++ = (char)((bit_buffer & 0xff) + offset);
|
|
else *q++ = (char)(bit_buffer & 0xff);
|
|
}
|
|
}
|
|
/* end of B64 encoding */
|
|
if (*p == '-') p++;
|
|
isb64=0;
|
|
} else if (*p == '+') { /* '+' is the beginning of a new B64 section */
|
|
isb64=1;
|
|
p++;
|
|
} else { /* ascii encoding */
|
|
*q++=*p++;
|
|
}
|
|
}
|
|
*q = '\0';
|
|
|
|
/* now we know the 8bit charset that was encoded whith utf-7,
|
|
* so ask again to see if a conversion to FTN charset is needed
|
|
*/
|
|
if (*code==CHRS_AUTODETECT || *code==CHRS_NOTSET)
|
|
*code=getoutcode(l_code);
|
|
switch (l_code) {
|
|
case CHRS_ISO_8859_1 :
|
|
case CHRS_ISO_8859_15:
|
|
switch (*code) {
|
|
case CHRS_CP437 : eight2eight(buf,out,(char *)ISO_8859_1__CP437); break;
|
|
case CHRS_CP850 : eight2eight(buf,out,(char *)ISO_8859_1__CP850); break;
|
|
case CHRS_MACINTOSH : eight2eight(buf,out,(char *)ISO_8859_1__MACINTOSH); break;
|
|
default : noconv(buf,out); break;
|
|
}
|
|
break;
|
|
case CHRS_ISO_8859_5 :
|
|
switch (*code) {
|
|
case CHRS_CP866 : eight2eight(buf,out,(char *)ISO_8859_5__CP866); break;
|
|
case CHRS_KOI8_R :
|
|
case CHRS_KOI8_U : eight2eight(buf,out,(char *)ISO_8859_5__KOI8); break;
|
|
case CHRS_MIK_CYR : eight2eight(buf,out,(char *)ISO_8859_5__MIK_CYR); break;
|
|
default : noconv(buf,out); break;
|
|
}
|
|
break;
|
|
case CHRS_ISO_8859_8 :
|
|
switch (*code) {
|
|
case CHRS_CP424 : eight2eight(buf,out,(char *)ISO_8859_8__CP424); break;
|
|
case CHRS_CP862 : eight2eight(buf,out,(char *)ISO_8859_8__CP862); break;
|
|
default : noconv(buf,out); break;
|
|
}
|
|
break;
|
|
default : noconv(in,out); break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* UNICODE UTF-8
|
|
* ------------- ------------------------------------
|
|
* 0000 -> 007F = 7 bits = 0xxxxxxx
|
|
* 0080 -> 07FF = 11 bits = 110xxxxx 10xxxxxx
|
|
* 0800 -> FFFF = 16 bits = 1110xxxx 10xxxxxx 10xxxxxx
|
|
*/
|
|
void utf8_to_eight(char *in,char **out,int *code)
|
|
{
|
|
int is8bit,l_code=CHRS_AUTODETECT;
|
|
char *p, *q, *buf;
|
|
|
|
buf=malloc(strlen(in)*sizeof(char));
|
|
|
|
is8bit=0;
|
|
for (p = in, q = buf; *p != '\0';) {
|
|
int bit_buffer=0;
|
|
int nbits=0;
|
|
int result,offset=0;
|
|
|
|
if ((*p & 0xff) >= 0xe0) { /* 16 bits = 1110xxxx 10xxxxxx 10xxxxxx */
|
|
bit_buffer=((*p++ & 0xff) & 0x0f);
|
|
bit_buffer=(bit_buffer << 4);
|
|
bit_buffer+=((*p++ & 0xff) & 0xbf);
|
|
bit_buffer=(bit_buffer << 6);
|
|
bit_buffer+=((*p++ & 0xff) & 0x3f);
|
|
nbits=16;
|
|
} else if ((*p & 0xff) >= 0xc0) { /* 11 bits = 110xxxxx 10xxxxxx */
|
|
bit_buffer=((*p++ & 0xff) & 0x2f);
|
|
bit_buffer=(bit_buffer << 6);
|
|
bit_buffer+=((*p++ & 0xff) & 0x3f);
|
|
nbits=11;
|
|
} else { /* 7 bits = 0xxxxxxx */
|
|
bit_buffer=(*p++ & 0xff);
|
|
nbits=7;
|
|
}
|
|
|
|
if (nbits >= 8) {
|
|
result = ((bit_buffer >> 8)&0xff);
|
|
/* if the charset code is unknown try to find it.
|
|
* it only works for latin1 (iso-8859-1), cyrillic, greek,
|
|
* arabic and hebrew (iso-8859-[5678]), as for other latin
|
|
* encodings it is harder, iso-8859-2 is assumed as it is
|
|
* the most common
|
|
*/
|
|
if ((l_code==CHRS_AUTODETECT) || (l_code==CHRS_ISO_8859_1)) {
|
|
if (result == 0x00) l_code=CHRS_ISO_8859_1;
|
|
else if (result == 0x01) l_code=CHRS_ISO_8859_2;
|
|
else if (result == 0x03) l_code=CHRS_ISO_8859_7;
|
|
else if (result == 0x04) l_code=CHRS_ISO_8859_5;
|
|
else if (result == 0x05) l_code=CHRS_ISO_8859_8;
|
|
else if (result == 0x06) l_code=CHRS_ISO_8859_6;
|
|
}
|
|
/* what to add to next byte to convert to iso-8859-*
|
|
* note that it doesn't work for iso-8859-{2,3,4,9,10}
|
|
* as the offset changes for almost each char
|
|
*/
|
|
if (result == 0x00) offset=0x00;
|
|
else if (result == 0x03) offset=0x30;
|
|
else if (result == 0x04) offset=0xa0;
|
|
else if (result == 0x05) offset=0x10;
|
|
else if (result == 0x06) offset=0xa0;
|
|
/* convert to the right 8bit char by adding offset */
|
|
if (result < 0x06) *q++ = (char)((bit_buffer & 0xff) + offset);
|
|
else *q++ = (char)(bit_buffer & 0xff);
|
|
} else { /* ascii encoding */
|
|
*q++ = (char)(bit_buffer & 0xff);
|
|
}
|
|
}
|
|
*q = '\0';
|
|
/* now we know the 8bit charset that was encoded whith utf-7,
|
|
* so ask again to see if a conversion to FTN charset is needed
|
|
*/
|
|
if (*code==CHRS_AUTODETECT || *code==CHRS_NOTSET)
|
|
*code=getoutcode(l_code);
|
|
switch (l_code) {
|
|
case CHRS_ISO_8859_1 :
|
|
case CHRS_ISO_8859_15:
|
|
switch (*code) {
|
|
case CHRS_CP437 : eight2eight(buf,out,(char *)ISO_8859_1__CP437); break;
|
|
case CHRS_CP850 : eight2eight(buf,out,(char *)ISO_8859_1__CP850); break;
|
|
case CHRS_MACINTOSH : eight2eight(buf,out,(char *)ISO_8859_1__MACINTOSH); break;
|
|
default : noconv(buf,out); break;
|
|
}
|
|
break;
|
|
case CHRS_ISO_8859_5 :
|
|
switch (*code) {
|
|
case CHRS_CP866 : eight2eight(buf,out,(char *)ISO_8859_5__CP866); break;
|
|
case CHRS_KOI8_R :
|
|
case CHRS_KOI8_U : eight2eight(buf,out,(char *)ISO_8859_5__KOI8); break;
|
|
case CHRS_MIK_CYR : eight2eight(buf,out,(char *)ISO_8859_5__MIK_CYR); break;
|
|
default : noconv(buf,out); break;
|
|
}
|
|
break;
|
|
case CHRS_ISO_8859_8 :
|
|
switch (*code) {
|
|
case CHRS_CP424 : eight2eight(buf,out,(char *)ISO_8859_8__CP424); break;
|
|
case CHRS_CP862 : eight2eight(buf,out,(char *)ISO_8859_8__CP862); break;
|
|
default : noconv(buf,out); break;
|
|
}
|
|
break;
|
|
default : noconv(in,out); break;
|
|
}
|
|
}
|
|
|
|
|