This repository has been archived on 2024-04-08. You can view files and clone it, but cannot push or open issues or pull requests.

421 lines
12 KiB
C++
Raw Normal View History

2002-10-26 20:08:56 +00:00
using namespace std;
#include <cctype>
#include <cstring>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iomanip>
// One entry of a Unicode data table (unicodedata / alt_unicodedata),
// indexed by code point.
typedef struct {
// Character name, copied from the second ';'-separated field of the data
// file and always NUL-terminated by load_unicodedata().
char name[256];
// Up to three replacement code points parsed from the sixth field of the
// data file; 0 marks an unused slot.
int replacement[3];
} unicodechar;
// A single-byte charset: one Unicode code point per byte value.
// load_charset() presets bytes 0..127 to themselves and bytes 128..255 to 0;
// generate_table() reports a 0 entry (other than byte 0) as "not defined".
typedef int charset[256];
// Source and target charsets of the conversion, filled in by main().
charset from_charset, to_charset;
// Character data loaded by main() from "UnicodeData.txt" and
// "AltUnicodeData.txt" respectively, indexed by code point.
unicodechar unicodedata[65536], alt_unicodedata[65536];
// Parses the run of hexadecimal digits at the start of str.
//
// Parsing stops at the first character that is not a hex digit; there is no
// "0x" prefix handling and no sign handling.  Returns 0 when str does not
// start with a hex digit.  Digits beyond the width of unsigned int are
// shifted out silently.
static int atox(const char *str)
{
    unsigned int result = 0;
    // The <cctype> classifiers require a value representable as unsigned
    // char (or EOF); passing a negative plain char is undefined behaviour,
    // so every character is converted first.
    for(int i = 0; isxdigit((unsigned char)str[i]); i++)
    {
        const unsigned char c = (unsigned char)str[i];
        result <<= 4;
        result |= isdigit(c) ? (unsigned int)(c - '0')
                             : (unsigned int)(toupper(c) - 'A' + 10);
    }
    return (int)result;
}
// Loads a charset description file into recode.
//
// recode is first reset: bytes 0..127 map to themselves, bytes 128..255 map
// to 0.  The file is then read line by line:
//   - empty lines and lines starting with '#' are skipped;
//   - a line starting with '0' carries the byte value in hex from its third
//     character on (presumably after a "0x" prefix -- the second character
//     itself is not inspected);
//   - a line starting with '=' carries the byte value in hex right after it;
//   - the Unicode value is the hex number following "U+" or "u+".
// Malformed lines are reported on cerr and skipped.  Byte values outside
// 0..255 and code points outside 0..0xFFFF are rejected as malformed, because
// they would index outside recode and outside the 65536-entry Unicode tables
// of this file.
static void load_charset(const char *fn, charset &recode)
{
    for(int chr = 0; chr < 128; chr++)
        recode[chr] = chr;
    for(int chr = 128; chr < 256; chr++)
        recode[chr] = 0;

    ifstream file(fn);
    if(!file)
    {
        cerr << "error opening file " << fn << endl;
        return;
    }

    char str[1024];
    // Looping on the getline() result (instead of on eof()) guarantees
    // termination: an over-long line sets failbit without eofbit, which
    // would otherwise make the loop spin forever.
    while(file.getline(str, sizeof(str)))
    {
        if((str[0] == '\0') || (str[0] == '#'))
            continue;

        int chr = -1;
        if((str[0] == '0') && (str[1] != '\0'))
            chr = atox(str + 2);    // never reads past the terminator
        else if(str[0] == '=')
            chr = atox(str + 1);
        if((chr < 0) || (chr > 255))
        {
            cerr << "error in input file (string: \"" << str << '\"' << endl;
            continue;
        }

        const char *ustr = strstr(str, "U+");
        if(ustr == NULL)
            ustr = strstr(str, "u+");
        if(ustr == NULL)
        {
            cerr << "error in input file (string: \"" << str << '\"' << endl;
            continue;
        }

        const int code = atox(ustr + 2);
        if((code < 0) || (code > 0xFFFF))
        {
            cerr << "error in input file (string: \"" << str << '\"' << endl;
            continue;
        }
        recode[chr] = code;
    }

    // The loop ends normally with eofbit set; anything else means a line
    // did not fit into the buffer or the stream failed.
    if(!file.eof())
        cerr << "error reading file " << fn << " (line too long or read failure)" << endl;
}
// Returns the lowest byte value whose to_charset entry equals unicode_char,
// or -1 when no entry matches.
static int lookup_char(int unicode_char)
{
    for(int pos = 0; pos < 256; pos++)
    {
        if(to_charset[pos] == unicode_char)
            return pos;
    }
    return -1;
}
// Returns the lowest byte value of to_charset whose character name in
// unicodedata is exactly unicode_name, or -1 when none matches.
static int lookup_char_by_name(const char *unicode_name)
{
    int pos = 0;
    while(pos < 256)
    {
        const char *candidate = unicodedata[to_charset[pos]].name;
        if(strcmp(candidate, unicode_name) == 0)
            return pos;
        pos++;
    }
    return -1;
}
// strtok()-like tokenizer that, unlike strtok(), does not merge adjacent
// delimiters: two delimiters in a row yield an empty token.
//
// str   - string to tokenize (modified in place), or NULL to continue with
//         the string from the previous call.
// delim - set of delimiter characters.
//
// Returns the next token, or NULL once the previous call has returned the
// last token.  The scan position lives in a function-local static, so only
// one string can be tokenized at a time.
//
// delim is const-qualified because every caller passes a string literal,
// which cannot legally convert to a plain char * in C++11.
static char *mystrtok(char *str, const char *delim)
{
    static char *last = NULL;
    if(str == NULL)
        str = last;
    if(str == NULL)
        return str;
    // Cut the token at the first delimiter and remember where the next
    // token starts; last becomes NULL when this is the final token.
    last = strpbrk(str, delim);
    if(last != NULL)
    {
        *last = '\0';
        last++;
    }
    return str;
}
// Loads a ';'-separated Unicode data file into udata.
//
// fn    - path of the data file.
// udata - table of 65536 entries, indexed by code point; cleared first.
//
// For every line that is neither empty nor a '#' comment:
//   field 0 is the code point in hex (lines whose code point does not fit
//           the table are skipped),
//   field 1 is stored as the character name,
//   field 5 supplies up to three replacement code points; an optional
//           leading tag ending in '>' is skipped.
// Lines with fewer fields keep whatever was parsed up to that point.
// A failure to open the file is reported on cerr.
static void load_unicodedata(const char *fn, unicodechar *udata)
{
    memset(udata, 0, sizeof(unicodechar) * 65536);

    ifstream file(fn);
    if(!file)
    {
        cerr << "error opening file " << fn << endl;
        return;
    }

    char str[1024];
    // Looping on the getline() result (instead of on eof()) guarantees
    // termination even when a line does not fit into the buffer.
    while(file.getline(str, sizeof(str)))
    {
        if((str[0] == '\0') || (str[0] == '#'))
            continue;

        char *tok = mystrtok(str, ";");
        if(tok == NULL)
            continue;
        const int chr = atox(tok);
        // Valid indices are 0..65535; the upper bound is exclusive.
        if((chr < 0) || (chr >= 65536))
            continue;

        tok = mystrtok(NULL, ";");
        if(tok == NULL)
            continue;
        strncpy(udata[chr].name, tok, sizeof(udata[chr].name));
        udata[chr].name[sizeof(udata[chr].name) - 1] = '\0';

        // Step over fields 2..4 so that tok ends up on field 5.
        for(int field = 2; (field <= 5) && (tok != NULL); field++)
            tok = mystrtok(NULL, ";");
        if(tok == NULL)
            continue;

        // A field that does not start with a hex digit must carry a tag
        // terminated by '>'; without one the field is ignored.
        if((tok[0] != '\0') && !isxdigit((unsigned char)tok[0]))
        {
            tok = strchr(tok, '>');
            if(tok == NULL)
                continue;
        }

        for(int n = 0; n < 3; n++)
        {
            while((tok[0] != '\0') && !isxdigit((unsigned char)tok[0]))
                tok++;
            if(tok[0] == '\0')
                break;
            udata[chr].replacement[n] = atox(tok);
            while(isxdigit((unsigned char)tok[0]))
                tok++;
        }
    }

    // The loop ends normally with eofbit set; anything else means a line
    // did not fit into the buffer or the stream failed.
    if(!file.eof())
        cerr << "error reading file " << fn << " (line too long or read failure)" << endl;
}
static void generate_table(const char *cp1, const char *cp2)
{
char incp[256], outcp[256], fn[sizeof(incp) + sizeof(outcp) + 4];
const char *slash = strrchr(cp1, '/');
const char *backslash = strrchr(cp1, '\\');
if((slash != NULL) && (backslash != NULL))
{
if(slash < backslash)
slash = backslash;
}
else if(slash == NULL)
slash = backslash;
strncpy(incp, (slash == NULL) ? cp1 : (slash + 1), sizeof(incp));
incp[sizeof(incp) - 1] = '\0';
char *dot = strchr(incp, '.');
if(dot != NULL)
dot[0] = '\0';
slash = strrchr(cp2, '/');
backslash = strrchr(cp2, '\\');
if((slash != NULL) && (backslash != NULL))
{
if(slash < backslash)
slash = backslash;
}
else if(slash == NULL)
slash = backslash;
strncpy(outcp, (slash == NULL) ? cp2 : (slash + 1), sizeof(outcp));
outcp[sizeof(outcp) - 1] = '\0';
dot = strchr(outcp, '.');
if(dot != NULL)
dot[0] = '\0';
strcpy(fn, incp);
strcat(fn, "_");
strcat(fn, outcp);
strcat(fn, ".chs");
ofstream file(fn);
if(file)
{
int chr = 0;
file << ";" << endl
<< "; This file is a charset conversion module in text form." << endl
<< ";" << endl
<< "; Automatically generated." << endl
<< ";" << endl
<< "0" << endl
<< "0" << endl
<< ";" << endl;
if(memcmp(from_charset, to_charset, 128 * sizeof(int)) == 0)
chr = 128;
file << ((chr == 128) ? 2 : 1) << endl
<< incp << "\t; from charset" << endl
<< outcp << "\t; to charset" << endl
<< ";" << endl;
for(; chr < 256; chr++)
{
int i;
if((chr != 0) && (from_charset[chr] == 0))
{
file << "\\0 ?" << "\t; not defined" << endl;
continue;
}
// try exact match first
i = lookup_char(from_charset[chr]);
if(i != -1)
{
file << "\\0 \\x" << hex << i << "\t; "
<< unicodedata[from_charset[chr]].name << endl;
continue;
}
// if not found exact match try to compose
int replacement_char1 = unicodedata[from_charset[chr]].replacement[0];
if(replacement_char1 != 0)
{
i = lookup_char(replacement_char1);
if(i != -1)
{
int replacement_char2 = unicodedata[from_charset[chr]].replacement[1];
if(replacement_char2 == 0)
{
file << "\\0 \\x" << hex << i << "\t; "
<< unicodedata[replacement_char1].name << endl;
continue;
}
int j = lookup_char(replacement_char2);
if(j != -1)
{
int replacement_char3 = unicodedata[from_charset[chr]].replacement[2];
if(replacement_char3 == 0)
{
file << "\\x" << hex << i << " \\x" << hex << j << "\t; "
<< unicodedata[replacement_char1].name << " + "
<< unicodedata[replacement_char2].name << endl;
continue;
}
int k = lookup_char(replacement_char3);
if(k != -1)
{
file << "\\x" << hex << i << " \\x" << hex << j << " \\x" << hex << k << "\t; "
<< unicodedata[replacement_char1].name << " + "
<< unicodedata[replacement_char2].name << " + "
<< unicodedata[replacement_char3].name << endl;
continue;
}
}
}
}
// if unable to compose through the standart data try our own
replacement_char1 = alt_unicodedata[from_charset[chr]].replacement[0];
if(replacement_char1 != 0)
{
i = lookup_char(replacement_char1);
if(i != -1)
{
int replacement_char2 = alt_unicodedata[from_charset[chr]].replacement[1];
if(replacement_char2 == 0)
{
file << "\\0 \\x" << hex << i << "\t; "
<< unicodedata[replacement_char1].name << endl;
continue;
}
int j = lookup_char(replacement_char2);
if(j != -1)
{
int replacement_char3 = alt_unicodedata[from_charset[chr]].replacement[2];
if(replacement_char3 == 0)
{
file << "\\x" << hex << i << " \\x" << hex << j << "\t; "
<< unicodedata[replacement_char1].name << " + "
<< unicodedata[replacement_char2].name << endl;
continue;
}
int k = lookup_char(replacement_char3);
if(k != -1)
{
file << "\\x" << hex << i << " \\x" << hex << j << " \\x" << hex << k << "\t; "
<< unicodedata[replacement_char1].name << " + "
<< unicodedata[replacement_char2].name << " + "
<< unicodedata[replacement_char3].name << endl;
continue;
}
}
}
}
// if not found exact match and unable to compose try to remove
// macrons, circumflexes, etc.
const char *letter;
const char *name = unicodedata[from_charset[chr]].name;
if((letter = strstr(name, "LETTER ")) != NULL)
{
letter += 7;
const char *letterend = strchr(letter, ' ');
char lettercopy[256];
int letterlen = ((letterend == NULL) ? strlen(name) : (letterend - name)) + 1;
if(letterlen > sizeof(lettercopy))
letterlen = sizeof(lettercopy);
strncpy(lettercopy, unicodedata[from_charset[chr]].name, letterlen);
lettercopy[letterlen - 1] = '\0';
i = lookup_char_by_name(lettercopy);
if(i != -1)
{
file << "\\0 \\x" << hex << i << "\t; " << lettercopy;
if(letterend)
file << " [" << (letterend + 1) << "]";
file << endl;
continue;
}
}
file << "\\0 ?" << "\t; (" << unicodedata[from_charset[chr]].name << ")" << endl;
}
file << "END" << endl;
}
else
{
cerr << "error opening file " << fn << endl;
}
}
// Entry point: expects exactly two arguments, the charset description files
// of the source and the target charset.  Loads both charsets and the two
// Unicode data files from the current directory, then writes the table.
int main(int ac, char *av[])
{
    const bool have_both_charsets = (ac == 3);
    if(!have_both_charsets)
    {
        cout << "usage: " << av[0] << " from_charset to_charset" << endl;
        exit(-1);
    }

    const char *from_file = av[1];
    const char *to_file = av[2];

    load_charset(from_file, from_charset);
    load_charset(to_file, to_charset);
    load_unicodedata("UnicodeData.txt", unicodedata);
    load_unicodedata("AltUnicodeData.txt", alt_unicodedata);
    generate_table(from_file, to_file);
    return 0;
}