deb-mbse/lib/charset.c

/*****************************************************************************
 *
 * $Id$
 * Purpose ...............: Characterset functions
 *
 *****************************************************************************
 * Copyright (C) 1997-2007
 *
 * Michiel Broek		FIDO:	2:280/2802
 * Beekmansbos 10
 * 1971 BV IJmuiden
 * the Netherlands
 *
 * This file is part of MBSE BBS.
 *
 * This BBS is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 2, or (at your option) any
 * later version.
 *
 * MBSE BBS is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with MBSE BBS; see the file COPYING.  If not, write to the Free
 * Software Foundation, 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 *****************************************************************************/

#include "../config.h"
#include "mbselib.h"


/*
 * Translation state shared by chartran_init(), chartran_close() and
 * chartran(). The use_tran flags are set by chartran_init() when the
 * matching conversion descriptor has been opened and cleared again
 * when that descriptor is closed, so a descriptor is only valid while
 * its flag is TRUE.
 */
int		use_tran1 = FALSE;	/* Stage 1 (fromset -> UTF-8) is open	*/
int		use_tran2 = FALSE;	/* Stage 2 (UTF-8 -> toset) is open	*/
int		loglevel = '-';		/* Syslog level, set by chartran_init()	*/
iconv_t		cd1;			/* Descriptor for stage 1		*/
iconv_t		cd2;			/* Descriptor for stage 2		*/


/*
 * Search table that maps a name found in a CHRS: kludge (first column)
 * to the name currently used in charmap[].ftnkludge (second column).
 * This table may include obsolete names (and we need them).
 *
 * find_ftn_charset() compares the first column case insensitive and
 * applies the replacement before it searches charmap. The table is
 * terminated by an entry with two NULL pointers.
 *
 * NOTE(review): because aliases are applied first, the "CP850" alias
 * sends that name to CP437, so the CP850 entry in charmap is never
 * selected by find_ftn_charset() - confirm that this is intended.
 * NOTE(review): the "IBM-915" alias points to "ISO-8859-1", which is
 * not the name part of any charmap[].ftnkludge, so that lookup ends
 * as "not found" - verify the intended target.
 */
struct _charalias charalias[] = {
    {(char *)"ASCII",           (char *)"CP437"},
    {(char *)"VT100",           (char *)"CP437"},
    {(char *)"US_ASCII",	(char *)"CP437"},
    {(char *)"LATIN",		(char *)"LATIN-1"},
    {(char *)"AMIGA",           (char *)"CP437"},
    {(char *)"IBMPC",           (char *)"CP437"},
    {(char *)"PC-8",            (char *)"CP437"},
    {(char *)"CP850",           (char *)"CP437"},
    {(char *)"MACINTOSH",       (char *)"MAC"},
    {(char *)"ALT",             (char *)"CP866"},
    {(char *)"X-ALT",           (char *)"CP866"},
    {(char *)"X-CP866",         (char *)"CP866"},
    {(char *)"+7_FIDO",         (char *)"CP866"},
    {(char *)"KOI8-U",          (char *)"KOI8-R"},
    {(char *)"IBM-878",         (char *)"KOI8-R"},
    {(char *)"CP878",           (char *)"KOI8-R"},
    {(char *)"IBM-915",         (char *)"ISO-8859-1"},
    {(char *)"X-CP1251",        (char *)"CP1251"},
    {(char *)"GBK",             (char *)"CP936"},
    {(char *)"HZ-GB-2312",      (char *)"CP936"},
    {NULL,                      NULL}
};


/*
 * Array of charset identifiers, one row per supported characterset.
 * Columns: ftncidx, ftnkludge, rfcname, ic_ftn, lang, desc
 *
 *  ftncidx	FTNC_xxx number of the charset.
 *  ftnkludge	CHRS: kludge text, the name followed by the level number.
 *  rfcname	charset name matched by find_rfc_charset().
 *  ic_ftn	name returned by get_ic_ftn() (presumably the iconv name
 *		for the FTN side - confirm with the callers).
 *  lang	locale name returned by getlocale().
 *  desc	description returned by getchrsdesc().
 *
 * The last row (FTNC_ERROR with NULL names) terminates the table for
 * all search loops. Several functions use a charset number directly
 * as index in this table, so the rows must stay in FTNC_xxx order.
 *
 * LATIN-5 is ISO 8859-9 (Turkish); ISO 8859-5 is the Cyrillic set, so
 * the rfcname and description of that row name ISO 8859-9.
 *
 * NOTE(review): the CP895 row has ic_ftn "CP850" and the LATIN-5 row
 * has lang "turks"; both look like placeholders - verify.
 */
struct _charmap charmap[] = {
    {FTNC_NONE,   (char *)"Undef",    (char *)"Undef",      (char *)"Undef",    (char *)"C",           (char *)"Undefined"},
    {FTNC_CP437,  (char *)"CP437 2",  (char *)"ibm437",     (char *)"CP437",    (char *)"en_US",       (char *)"IBM codepage 437 (Western European) (ANSI terminal)"},
    {FTNC_CP850,  (char *)"CP850 2",  (char *)"ibm850",     (char *)"CP850",    (char *)"en_US",       (char *)"IBM codepage 850 (Latin-1) (ANSI terminal)"},
    {FTNC_CP865,  (char *)"CP865 2",  (char *)"ibm865",     (char *)"CP865",    (char *)"sv_SE",       (char *)"IBM codepage 865 (Nordic)"},
    {FTNC_CP866,  (char *)"CP866 2",  (char *)"ibm866",     (char *)"CP866",    (char *)"ru_RU",       (char *)"IBM codepage 866 (Russian)"},
    {FTNC_LATIN_1,(char *)"LATIN-1 2",(char *)"iso-8859-1", (char *)"LATIN1",   (char *)"en_US",       (char *)"ISO 8859-1 (Western European)"},
    {FTNC_LATIN_2,(char *)"LATIN-2 2",(char *)"iso-8859-2", (char *)"LATIN2",   (char *)"cs_CZ",       (char *)"ISO 8859-2 (Eastern European)"},
    {FTNC_LATIN_5,(char *)"LATIN-5 2",(char *)"iso-8859-9", (char *)"LATIN5",   (char *)"turks",       (char *)"ISO 8859-9 (Turkish)"},
    {FTNC_MAC,    (char *)"MAC",      (char *)"Macintosh",  (char *)"MACINTOSH",(char *)"en_US",       (char *)"MacIntosh character set"},
    {FTNC_CP852,  (char *)"CP852 2",  (char *)"ibm852",     (char *)"CP852",    (char *)"cs_CZ",       (char *)"IBM codepage 852 (Czech, Latin-2)"},
    {FTNC_CP895,  (char *)"CP895 2",  (char *)"cp895",      (char *)"CP850",    (char *)"cs_CZ",       (char *)"IBM codepage 895 (Czech, Kamenicky)"},
    {FTNC_KOI8_R, (char *)"KOI8-R 2", (char *)"koi8-r",     (char *)"KOI8-R",   (char *)"ru_RU.koi8r", (char *)"Unix codepage KOI8-R (Russian)"},
    {FTNC_CP936,  (char *)"CP936 2",  (char *)"gbk",        (char *)"CP936",    (char *)"zh_CN.gbk",   (char *)"IBM codepage 936 (Chinese, GBK)"},
    {FTNC_LATIN_9,(char *)"LATIN-9 2",(char *)"iso-8859-15",(char *)"LATIN-9",  (char *)"en_US",       (char *)"ISO 8859-15 (Western European EURO)"},
    {FTNC_UTF8,   (char *)"UTF-8 4",  (char *)"utf-8",      (char *)"UTF-8",    (char *)"en_US.UTF-8", (char *)"Unicode UTF-8 (ISO/IEC 10646)"},
    {FTNC_ERROR,  NULL,               NULL,                 NULL,               NULL,                  (char *)"ERROR"}
};


/*
 * Look up the value of a FidoNet CHRS: kludge (for example "CP437 2")
 * in the charmap table. Only the part before the first space is used,
 * so the charset level number is ignored, and names are compared case
 * insensitive. A name that is listed in charalias is replaced by its
 * current name before charmap is searched.
 *
 * Returns the index of the matching charmap entry. If the name is
 * unknown or ftnkludge is NULL an error is logged and FTNC_ERROR is
 * returned.
 */
int find_ftn_charset(char *ftnkludge)
{
    char	ftn[80], cmp[80], *sp;
    int		i;

    if (ftnkludge == NULL) {
	WriteError("find_ftn_charset(NULL) not found");
	return FTNC_ERROR;
    }

    Syslog('-', "find_ftn_charset(%s)", ftnkludge);

    /*
     * Bounded copy on the stack; everything from the first space on
     * is the level number and is cut off.
     */
    snprintf(ftn, sizeof(ftn), "%s", ftnkludge);
    if ((sp = strchr(ftn, ' ')) != NULL)
	*sp = '\0';

    for (i = 0; charalias[i].alias; i++) {
	if (strcasecmp(ftn, charalias[i].alias) == 0) {
	    Syslog('-', "found alias %s", charalias[i].ftnkludge);
	    snprintf(ftn, sizeof(ftn), "%s", charalias[i].ftnkludge);
	    break;
	}
    }

    /*
     * Now search real entry. Throw away the charset level number,
     * we don't care about that useless byte.
     */
    for (i = 0; charmap[i].ftnkludge; i++) {
	snprintf(cmp, sizeof(cmp), "%s", charmap[i].ftnkludge);
	if ((sp = strchr(cmp, ' ')) != NULL)
	    *sp = '\0';
	if (strcasecmp(ftn, cmp) == 0) {
	    Syslog('-', "find_ftn_charset(%s) result %d", ftnkludge, i);
	    return i;
	}
    }

    WriteError("find_ftn_charset(%s) not found", ftnkludge);
    return FTNC_ERROR;
}


/*
 * Look up a RFC charset name (for example "iso-8859-1") in the rfcname
 * column of the charmap table. Names are compared case insensitive.
 *
 * Returns the index of the matching charmap entry. If the name is
 * unknown or rfcname is NULL this is logged with level '!' and
 * FTNC_ERROR is returned.
 */
int find_rfc_charset(char *rfcname)
{
    int	i;

    /* strcasecmp() must never see a NULL pointer. */
    if (rfcname == NULL) {
	Syslog('!', "find_rfc_charset(NULL) not found");
	return FTNC_ERROR;
    }

    Syslog('-', "find_rfc_charset(%s)", rfcname);

    for (i = 0; charmap[i].rfcname; i++) {
	if (strcasecmp(rfcname, charmap[i].rfcname) == 0) {
	    Syslog('-', "find_rfc_charset(%s) result %d", rfcname, i);
	    return i;
	}
    }

    Syslog('!', "find_rfc_charset(%s) not found", rfcname);
    return FTNC_ERROR;
}


/*
 * Return the CHRS: kludge text of charset number val. The number is
 * used directly as index in charmap. For a number outside the range
 * FTNC_NONE .. FTNC_MAXCHARS the text "LATIN-1 2" is returned.
 *
 * A valid result is stored in a static buffer that is overwritten by
 * the next call.
 */
char *getftnchrs(int val)
{
    static char	kludge[20];

    if ((val < FTNC_NONE) || (val > FTNC_MAXCHARS))
	return (char *)"LATIN-1 2";

    Syslog('-', "getftnchrs(%d) %s", val, charmap[val].ftnkludge);
    snprintf(kludge, sizeof(kludge), "%s", charmap[val].ftnkludge);

    return kludge;
}


/*
 * Return the RFC charset name of charset number val. The number is
 * used directly as index in charmap. For a number outside the range
 * FTNC_NONE .. FTNC_MAXCHARS the name "iso-8859-1" is returned.
 *
 * A valid result is stored in a static buffer that is overwritten by
 * the next call.
 */
char *getrfcchrs(int val)
{
    static char	rfcname[20];

    if ((val < FTNC_NONE) || (val > FTNC_MAXCHARS))
	return (char *)"iso-8859-1";

    Syslog('-', "getrfcchrs(%d) %s", val, charmap[val].rfcname);
    snprintf(rfcname, sizeof(rfcname), "%s", charmap[val].rfcname);

    return rfcname;
}


/*
 * Return the ic_ftn name of charset number val. The number is used
 * directly as index in charmap. For a number outside the range
 * FTNC_NONE .. FTNC_MAXCHARS the name "LATIN1" is returned.
 *
 * A valid result is stored in a static buffer that is overwritten by
 * the next call.
 */
char *get_ic_ftn(int val)
{
    static char ic_ftnname[20];

    if ((val < FTNC_NONE) || (val > FTNC_MAXCHARS))
	return (char *)"LATIN1";

    Syslog('-', "get_ic_ftn(%d) %s", val, charmap[val].ic_ftn);
    snprintf(ic_ftnname, sizeof(ic_ftnname), "%s", charmap[val].ic_ftn);

    return ic_ftnname;
}


/*
 * Return the RFC charset name of charset number val after it has been
 * processed by tu() (presumably a conversion to upper case - tu() is
 * defined elsewhere). The name itself comes from getrfcchrs().
 *
 * The result is stored in a static buffer that is overwritten by the
 * next call.
 */
char *get_ic_rfc(int val)
{
    static char ic_rfcname[20];

    /*
     * The name is data and must never be used as the format string,
     * a '%' in the table would be interpreted by snprintf.
     */
    snprintf(ic_rfcname, sizeof(ic_rfcname), "%s", getrfcchrs(val));
    tu(ic_rfcname);
    return ic_rfcname;
}


/*
 * Experimental translation from the charset a user selected to a
 * locale name. This is not the right way to do it; it would be better
 * to store the locale of each bbs user and then look up his
 * characterset with standard library calls.
 *
 * This is one of the things the bbs world never saw coming, in the
 * "good" old days bbses were almost always called local. Thanks to
 * the internet bbs users are now all over the world.
 *
 * The charmap table is searched for an entry with charset number val.
 * The locale of that entry is returned in a static buffer that is
 * overwritten by the next call. If there is no such entry the name
 * "C" is returned.
 */
char *getlocale(int val)
{
    static char	langc[20];
    int		idx = 0;

    while (charmap[idx].ftncidx != FTNC_ERROR) {
	if (charmap[idx].ftncidx == val) {
	    snprintf(langc, sizeof(langc), "%s", charmap[idx].lang);
	    return langc;
	}
	idx++;
    }

    return (char *)"C";
}


/*
 * Return the description of charset number val. The charmap table is
 * searched for an entry with that number; its description is returned
 * in a static buffer that is overwritten by the next call. If there
 * is no such entry the text "ERROR" is returned.
 */
char *getchrsdesc(int val)
{
    static char	desc[60];
    int		idx = 0;

    while (charmap[idx].ftncidx != FTNC_ERROR) {
	if (charmap[idx].ftncidx == val) {
	    snprintf(desc, sizeof(desc), "%s", charmap[idx].desc);
	    return desc;
	}
	idx++;
    }

    return (char *)"ERROR";
}


/*
 * Initialize charset translation from fromset to toset. Translation
 * can be done in 2 stages with UTF-8 as the common centre, because a
 * direct translation between for example CP437 and ISO-8859-1 doesn't
 * work. Stage 1 translates fromset to UTF-8, stage 2 translates UTF-8
 * to toset. If one of the two sides already is "UTF-8" only the other
 * stage is opened. If both names are the same no stage is opened and
 * translation is off.
 *
 * loglvl is the Syslog level used by the chartran functions from now
 * on. If a previous translation is still open this is logged as an
 * error and the old translation is closed first.
 *
 * On success return 0, on error return -1 and write errorlog; no
 * stage is left open after an error.
 */
int chartran_init(char *fromset, char *toset, int loglvl)
{
    int	want_in, want_out;

    loglevel = loglvl;

    if (use_tran1 || use_tran2) {
	WriteError("chartran_init() called while still open");
	chartran_close();
    }

    Syslog(loglevel, "chartran_init(%s, %s)", fromset, toset);

    if (strcmp(fromset, toset) == 0) {
	Syslog(loglevel, "nothing to translate");
	return 0;
    }

    /* A side that already is UTF-8 needs no stage of its own. */
    want_in  = (strcmp(fromset, "UTF-8") != 0);
    want_out = (strcmp(toset, "UTF-8") != 0);

    if (want_in) {
	if ((cd1 = iconv_open("UTF-8", fromset)) == (iconv_t)-1) {
	    WriteError("$chartran_init(%s, %s): iconv_open(UTF-8, %s) error", fromset, toset, fromset);
	    return -1;
	}
	use_tran1 = TRUE;
    }

    if (want_out) {
	if ((cd2 = iconv_open(toset, "UTF-8")) == (iconv_t)-1) {
	    WriteError("$chartran_init(%s, %s): iconv_open(%s, UTF-8) error", fromset, toset, toset);
	    /* Don't leave a half open translation behind. */
	    if (use_tran1) {
		iconv_close(cd1);
		use_tran1 = FALSE;
	    }
	    return -1;
	}
	use_tran2 = TRUE;
    }

    return 0;
}


/*
 * Close the active charset translation. Each stage that is marked
 * open gets its conversion descriptor closed and its flag cleared;
 * calling this when nothing is open only writes the log line.
 */
void chartran_close(void)
{
    Syslog(loglevel, "chartran_close()");

    /* Stage 1 */
    if (use_tran1) {
	use_tran1 = FALSE;
	iconv_close(cd1);
    }

    /* Stage 2 */
    if (use_tran2) {
	use_tran2 = FALSE;
	iconv_close(cd2);
    }
}


/*
 * Convert the NUL terminated string src with descriptor cd and store
 * the result in dst, a buffer of dstsize bytes. The last byte of dst
 * is never offered to iconv, so the result is always NUL terminated.
 * Returns 0 on success and -1 if iconv reported an error.
 */
static int chartran_stage(iconv_t cd, char *src, char *dst, size_t dstsize)
{
    size_t	rc, inSize = strlen(src), outSize = dstsize - 1;
    char	*in = src, *out = dst;

    rc = iconv(cd, &in, &inSize, &out, &outSize);
    *out = '\0';

    return (rc == (size_t)-1) ? -1 : 0;
}


/*
 * Translate a string, chartran_init must have been called to register
 * the charactersets to translate between. Depending on the stages
 * that are open the input is copied unchanged, translated in one
 * stage or translated in two stages with UTF-8 as the middleman.
 *
 * If a translation stage fails an error is logged and the input is
 * returned untranslated. A NULL input gives an empty string.
 *
 * The result is at most 4095 bytes and is stored in a static buffer
 * that is overwritten by the next call.
 */
char *chartran(char *input)
{
    static char	outbuf[4096];
    static char	temp[4096];

    /*
     * With a zero filled buffer every strncpy of sizeof -1 bytes
     * below leaves a terminated string.
     */
    memset(outbuf, 0, sizeof(outbuf));

    if (input == NULL)
	return outbuf;

    /*
     * Transparent
     */
    if (!use_tran1 && !use_tran2) {
	strncpy(outbuf, input, sizeof(outbuf) -1);
	return outbuf;
    }

    /*
     * Translate to UTF-8
     */
    if (use_tran1 && !use_tran2) {
	if (chartran_stage(cd1, input, outbuf, sizeof(outbuf)) == -1) {
	    WriteError("$iconv(%s) cd1", printable(input, 0));
	    strncpy(outbuf, input, sizeof(outbuf) -1);
	}
	return outbuf;
    }

    /*
     * Translate from UTF-8
     */
    if (!use_tran1 && use_tran2) {
	if (chartran_stage(cd2, input, outbuf, sizeof(outbuf)) == -1) {
	    WriteError("$iconv(%s) cd2", printable(input, 0));
	    strncpy(outbuf, input, sizeof(outbuf) -1);
	}
	return outbuf;
    }

    /*
     * Double translation with UTF-8 as the middleman.
     */
    if (chartran_stage(cd1, input, temp, sizeof(temp)) == -1) {
	WriteError("$iconv(%s) cd1", printable(input, 0));
	strncpy(outbuf, input, sizeof(outbuf) -1);
	return outbuf;
    }

    if (chartran_stage(cd2, temp, outbuf, sizeof(outbuf)) == -1) {
	WriteError("$iconv(%s) cd2", printable(temp, 0));
	strncpy(outbuf, input, sizeof(outbuf) -1);
    }

    return outbuf;
}