From 949f5e04efd80f2892f960f04a7454bc58f1d212 Mon Sep 17 00:00:00 2001 From: Mike Gabriel Date: Sat, 4 Feb 2017 10:52:05 +0100 Subject: doc/libNX_X11/lcUniConv: Move over the rather-documentary files 8bit_tab_to_h.c and cjk_tab_to_h.c to nx-libs's doc/ folder. --- doc/libNX_X11/lcUniConv/8bit_tab_to_h.c | 535 +++++++++++++++ doc/libNX_X11/lcUniConv/cjk_tab_to_h.c | 1071 +++++++++++++++++++++++++++++++ 2 files changed, 1606 insertions(+) create mode 100644 doc/libNX_X11/lcUniConv/8bit_tab_to_h.c create mode 100644 doc/libNX_X11/lcUniConv/cjk_tab_to_h.c (limited to 'doc') diff --git a/doc/libNX_X11/lcUniConv/8bit_tab_to_h.c b/doc/libNX_X11/lcUniConv/8bit_tab_to_h.c new file mode 100644 index 000000000..993979aeb --- /dev/null +++ b/doc/libNX_X11/lcUniConv/8bit_tab_to_h.c @@ -0,0 +1,535 @@ + +/* + * Generates an 8-bit character set table from a .TXT table as found on + * ftp.unicode.org or from a table containing the 256 Unicode values as + * hexadecimal integers. + * Examples: + * + * ./8bit_tab_to_h ISO-8859-1 iso8859_1 < tab8859_1 + * ./8bit_tab_to_h ISO-8859-2 iso8859_2 < tab8859_2 + * ./8bit_tab_to_h ISO-8859-3 iso8859_3 < tab8859_3 + * ./8bit_tab_to_h ISO-8859-4 iso8859_4 < tab8859_4 + * ./8bit_tab_to_h ISO-8859-5 iso8859_5 < tab8859_5 + * ./8bit_tab_to_h ISO-8859-6 iso8859_6 < tab8859_6 + * ./8bit_tab_to_h ISO-8859-7 iso8859_7 < tab8859_7 + * ./8bit_tab_to_h ISO-8859-8 iso8859_8 < tab8859_8 + * ./8bit_tab_to_h ISO-8859-9 iso8859_9 < tab8859_9 + * ./8bit_tab_to_h ISO-8859-10 iso8859_10 < tab8859_10 + * ./8bit_tab_to_h ISO-8859-14 iso8859_14 < tab8859_14 + * ./8bit_tab_to_h ISO-8859-15 iso8859_15 < tab8859_15 + * ./8bit_tab_to_h JISX0201.1976-0 jisx0201 < jis0201 + * ./8bit_tab_to_h TIS620-0 tis620 < tabtis620 + * ./8bit_tab_to_h KOI8-R koi8_r < tabkoi8_r + * ./8bit_tab_to_h KOI8-U koi8_u < tabkoi8_u + * ./8bit_tab_to_h ARMSCII-8 armscii_8 < tabarmscii_8 + * ./8bit_tab_to_h CP1133 cp1133 < tabibm_cp1133 + * ./8bit_tab_to_h MULELAO-1 mulelao < tabmulelao_1 + * 
/*
 *   ./8bit_tab_to_h VISCII1.1-1 viscii1 < tabviscii
 *   ./8bit_tab_to_h TCVN-5712 tcvn < tabtcvn
 *   ./8bit_tab_to_h GEORGIAN-ACADEMY georgian_ac < tabgeorgian_academy
 *   ./8bit_tab_to_h GEORGIAN-PS georgian_ps < tabgeorgian_ps
 *
 *   ./8bit_tab_to_h ISO-8859-1 iso8859_1 < 8859-1.TXT
 *   ./8bit_tab_to_h ISO-8859-2 iso8859_2 < 8859-2.TXT
 *   ./8bit_tab_to_h ISO-8859-3 iso8859_3 < 8859-3.TXT
 *   ./8bit_tab_to_h ISO-8859-4 iso8859_4 < 8859-4.TXT
 *   ./8bit_tab_to_h ISO-8859-5 iso8859_5 < 8859-5.TXT
 *   ./8bit_tab_to_h ISO-8859-6 iso8859_6 < 8859-6.TXT
 *   ./8bit_tab_to_h ISO-8859-7 iso8859_7 < 8859-7.TXT
 *   ./8bit_tab_to_h ISO-8859-8 iso8859_8 < 8859-8.TXT
 *   ./8bit_tab_to_h ISO-8859-9 iso8859_9 < 8859-9.TXT
 *   ./8bit_tab_to_h ISO-8859-10 iso8859_10 < 8859-10.TXT
 *   ./8bit_tab_to_h ISO-8859-14 iso8859_14 < 8859-14.TXT
 *   ./8bit_tab_to_h ISO-8859-15 iso8859_15 < 8859-15.TXT
 *   ./8bit_tab_to_h JISX0201.1976-0 jisx0201 < JIS0201.TXT
 *   ./8bit_tab_to_h KOI8-R koi8_r < KOI8-R.TXT
 */

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Prints MSG on stderr and terminates the program with exit status 1. */
static void fail (const char* msg)
{
  fprintf(stderr, "8bit_tab_to_h: %s\n", msg);
  exit(1);
}

/* malloc() that terminates the program instead of returning NULL. */
static void* xmalloc (size_t size)
{
  void* p = malloc(size);
  if (p == NULL)
    fail("out of memory");
  return p;
}

/*
 * Reads an 8-bit charset table from stdin and writes a C header fragment
 * containing <c_charsetname>_mbtowc and <c_charsetname>_wctomb plus the
 * lookup tables they need.
 *
 *   argv[1]  charset name, copied into the header's title comment
 *   argv[2]  C identifier prefix used for the generated tables/functions
 *   argv[3]  optional output file name (default: argv[2] followed by ".h")
 *   argv[4]  optional directory prefix, concatenated verbatim before argv[3]
 *
 * Exits with status 1 on a usage error, malformed input or I/O failure,
 * and with status 0 on success.
 */
int main (int argc, char *argv[])
{
  const char* charsetname;
  const char* c_charsetname;
  const char* filename;
  const char* directory;
  int charset2uni[0x100];

  if (argc != 3 && argc != 4 && argc != 5)
    exit(1);
  charsetname = argv[1];
  c_charsetname = argv[2];
  if (argc > 3) {
    filename = argv[3];
  } else {
    char* s = xmalloc(strlen(c_charsetname)+strlen(".h")+1);
    strcpy(s,c_charsetname); strcat(s,".h");
    filename = s;
  }
  directory = (argc > 4 ? argv[4] : "");

  fprintf(stderr, "Creating %s%s\n", directory, filename);

  {
    int i, c;
    unsigned int u; /* %x requires an unsigned int target */
    c = getc(stdin);
    ungetc(c,stdin);
    if (c == '#') {
      /* Read a unicode.org style .TXT file. */
      for (i = 0; i < 0x100; i++)
        charset2uni[i] = 0xfffd;
      for (;;) {
        c = getc(stdin);
        if (c == EOF)
          break;
        if (c == '\n' || c == ' ' || c == '\t')
          continue;
        if (c == '#') {
          do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
          continue;
        }
        ungetc(c,stdin);
        if (scanf("0x%x", &u) != 1 || !(u < 0x100))
          exit(1);
        i = (int) u;
        do { c = getc(stdin); } while (c == ' ' || c == '\t');
        if (c != EOF)
          ungetc(c,stdin);
        /* A byte value without a mapping stays at 0xfffd. */
        if (c == '\n' || c == '#')
          continue;
        if (scanf("0x%x", &u) != 1)
          exit(1);
        /* uni2charset[] below only has 0x10000 entries. */
        if (u > 0xffff)
          fail("Unicode value beyond 0xffff in input");
        charset2uni[i] = (int) u;
      }
    } else {
      /* Read a table of hexadecimal Unicode values. */
      for (i = 0; i < 0x100; i++) {
        if (scanf("%x", &u) != 1)
          exit(1);
        /* Values that would be negative as an int, and 0xffff, mean
           "no mapping". */
        if (u > (unsigned int) INT_MAX || u == 0xffff)
          u = 0xfffd;
        if (u > 0xffff)
          fail("Unicode value beyond 0xffff in input");
        charset2uni[i] = (int) u;
      }
      /* Anything after the 256th value is an error. */
      if (scanf("%x", &u) != EOF)
        exit(1);
    }
  }

  /* Write the output file. */
  {
    FILE* f;

    {
      char* fname = xmalloc(strlen(directory)+strlen(filename)+1);
      strcpy(fname,directory); strcat(fname,filename);
      f = fopen(fname,"w");
      free(fname);
      if (f == NULL)
        exit(1);
    }

    fprintf(f, "\n");
    fprintf(f, "/*\n");
    fprintf(f, " * %s\n", charsetname);
    fprintf(f, " */\n");
    fprintf(f, "\n");

    /* Charset -> Unicode direction. */
    {
      int i, i1, i2, i3;
      /* line[i1] classifies the 16 byte values 16*i1 .. 16*i1+15:
         -2 = all unmapped, -1 = all map to themselves,
         >= 0 = index of the table that covers this line. */
      int line[16];
      int tableno;
      struct { int minline; int maxline; } tables[16];
      bool some_invalid;
      bool final_ret_reached;

      for (i1 = 0; i1 < 16; i1++) {
        bool all_invalid = true;
        bool all_identity = true;
        for (i2 = 0; i2 < 16; i2++) {
          i = 16*i1+i2;
          if (charset2uni[i] != 0xfffd)
            all_invalid = false;
          if (charset2uni[i] != i)
            all_identity = false;
        }
        if (all_invalid)
          line[i1] = -2;
        else if (all_identity)
          line[i1] = -1;
        else
          line[i1] = 0;
      }
      /* Merge adjacent table lines into contiguous tables. */
      tableno = 0;
      for (i1 = 0; i1 < 16; i1++) {
        if (line[i1] >= 0) {
          if (i1 > 0 && tableno > 0 && line[i1-1] == tableno-1) {
            line[i1] = tableno-1;
            tables[tableno-1].maxline = i1;
          } else {
            tableno++;
            line[i1] = tableno-1;
            tables[tableno-1].minline = tables[tableno-1].maxline = i1;
          }
        }
      }
      some_invalid = false;
      for (i = 0; i < 0x100; i++)
        if (charset2uni[i] == 0xfffd)
          some_invalid = true;
      if (tableno > 0) {
        int t;
        for (t = 0; t < tableno; t++) {
          fprintf(f, "static const unsigned short %s_2uni", c_charsetname);
          if (tableno > 1)
            fprintf(f, "_%d", t+1);
          fprintf(f, "[%d] = {\n", 16*(tables[t].maxline-tables[t].minline+1));
          for (i1 = tables[t].minline; i1 <= tables[t].maxline; i1++) {
            fprintf(f, " /* 0x%02x */\n", 16*i1);
            for (i2 = 0; i2 < 2; i2++) {
              fprintf(f, " ");
              for (i3 = 0; i3 < 8; i3++) {
                i = 16*i1+8*i2+i3;
                fprintf(f, " 0x%04x,", charset2uni[i]);
              }
              fprintf(f, "\n");
            }
          }
          fprintf(f, "};\n");
        }
        fprintf(f, "\n");
      }
      final_ret_reached = false;
      fprintf(f, "static int\n%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", c_charsetname);
      fprintf(f, "{\n");
      fprintf(f, " unsigned char c = *s;\n");
      if (some_invalid) {
        /* Some byte values are unmapped: emit braced branches that can
           fall through to a final RET_ILSEQ. */
        for (i1 = 0; i1 < 16;) {
          int t = line[i1];
          const char* indent;
          for (i2 = i1; i2 < 16 && line[i2] == t; i2++);
          indent = (i1 == 0 && i2 == 16 ? " " : " ");
          if (i1 == 0) {
            if (i2 == 16) {
            } else {
              fprintf(f, " if (c < 0x%02x) {\n", 16*i2);
            }
          } else {
            if (i2 == 16) {
              fprintf(f, " else {\n");
            } else {
              fprintf(f, " else if (c < 0x%02x) {\n", 16*i2);
            }
          }
          if (t == -2) {
            final_ret_reached = true;
          } else if (t == -1) {
            fprintf(f, "%s*pwc = (ucs4_t) c;\n", indent);
            fprintf(f, "%sreturn 1;\n", indent);
          } else {
            fprintf(f, "%s", indent);
            /* From here on some_invalid refers to this range only. */
            some_invalid = false;
            for (i = 16*i1; i < 16*i2; i++)
              if (charset2uni[i] == 0xfffd)
                some_invalid = true;
            if (some_invalid)
              fprintf(f, "unsigned short wc = ");
            else
              fprintf(f, "*pwc = (ucs4_t) ");
            fprintf(f, "%s_2uni", c_charsetname);
            if (tableno > 1)
              fprintf(f, "_%d", t+1);
            fprintf(f, "[c");
            if (tables[t].minline > 0)
              fprintf(f, "-0x%02x", 16*tables[t].minline);
            fprintf(f, "];\n");
            if (some_invalid) {
              fprintf(f, "%sif (wc != 0xfffd) {\n", indent);
              fprintf(f, "%s *pwc = (ucs4_t) wc;\n", indent);
              fprintf(f, "%s return 1;\n", indent);
              fprintf(f, "%s}\n", indent);
              final_ret_reached = true;
            } else {
              fprintf(f, "%sreturn 1;\n", indent);
            }
          }
          if (!(i1 == 0 && i2 == 16))
            fprintf(f, " }\n");
          i1 = i2;
        }
        if (final_ret_reached)
          fprintf(f, " return RET_ILSEQ;\n");
      } else {
        /* Every byte value is mapped: emit an unbraced if/else chain
           followed by a single "return 1". */
        for (i1 = 0; i1 < 16;) {
          int t = line[i1];
          for (i2 = i1; i2 < 16 && line[i2] == t; i2++);
          if (i1 == 0) {
            if (i2 == 16) {
              fprintf(f, " ");
            } else {
              fprintf(f, " if (c < 0x%02x)\n ", 16*i2);
            }
          } else {
            if (i2 == 16) {
              fprintf(f, " else\n ");
            } else {
              fprintf(f, " else if (c < 0x%02x)\n ", 16*i2);
            }
          }
          if (t == -1)
            fprintf(f, "*pwc = (ucs4_t) c;\n");
          else {
            fprintf(f, "*pwc = (ucs4_t) %s_2uni", c_charsetname);
            if (tableno > 1)
              fprintf(f, "_%d", t+1);
            fprintf(f, "[c");
            if (tables[t].minline > 0)
              fprintf(f, "-0x%02x", 16*tables[t].minline);
            fprintf(f, "];\n");
          }
          i1 = i2;
        }
        fprintf(f, " return 1;\n");
      }
      fprintf(f, "}\n");

    }

    fprintf(f, "\n");

    /* Unicode -> charset direction. */
    {
      int uni2charset[0x10000];
      /* line[j1] classifies the 8 code points 8*j1 .. 8*j1+7, with the
         same -2 / -1 / table-index convention as above. */
      int line[0x2000];
      int tableno;
      struct { int minline; int maxline; int usecount; const char* suffix; } tables[0x2000];
      bool need_c;
      bool fix_0000;
      int i, j, p, j1, j2, t;

      for (j = 0; j < 0x10000; j++)
        uni2charset[j] = 0;
      for (i = 0; i < 0x100; i++) {
        j = charset2uni[i];
        /* j is within 0..0xffff: the input reader rejects anything else. */
        if (j != 0xfffd)
          uni2charset[j] = i;
      }
      for (j1 = 0; j1 < 0x2000; j1++) {
        bool all_invalid = true;
        bool all_identity = true;
        for (j2 = 0; j2 < 8; j2++) {
          j = 8*j1+j2;
          if (uni2charset[j] != 0)
            all_invalid = false;
          if (uni2charset[j] != j)
            all_identity = false;
        }
        if (all_invalid)
          line[j1] = -2;
        else if (all_identity)
          line[j1] = -1;
        else
          line[j1] = 0;
      }
      /* Merge lines into tables.  A line joins the previous table when it
         is adjacent to it, or lies in the same 256-code-point page at most
         8 lines further on. */
      tableno = 0;
      for (j1 = 0; j1 < 0x2000; j1++) {
        if (line[j1] >= 0) {
          if (tableno > 0
              && ((j1 > 0 && line[j1-1] == tableno-1)
                  || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
                      && j1 - tables[tableno-1].maxline <= 8))) {
            line[j1] = tableno-1;
            tables[tableno-1].maxline = j1;
          } else {
            tableno++;
            line[j1] = tableno-1;
            tables[tableno-1].minline = tables[tableno-1].maxline = j1;
          }
        }
      }
      for (t = 0; t < tableno; t++) {
        tables[t].usecount = 0;
        j1 = 8*tables[t].minline;
        j2 = 8*(tables[t].maxline+1);
        for (j = j1; j < j2; j++)
          if (uni2charset[j] != 0)
            tables[t].usecount++;
      }
      /* Name the tables after their page; further tables in the same page
         get a numeric suffix. */
      for (t = 0, p = -1, i = 0; t < tableno; t++) {
        if (tables[t].usecount > 1) {
          enum { SUFFIX_SIZE = 16 };
          char* s = xmalloc(SUFFIX_SIZE);
          if (p == tables[t].minline >> 5) {
            snprintf(s, SUFFIX_SIZE, "%02x_%d", p, ++i);
          } else {
            p = tables[t].minline >> 5;
            snprintf(s, SUFFIX_SIZE, "%02x", p);
          }
          tables[t].suffix = s;
        } else
          tables[t].suffix = NULL;
      }
      {
        p = -1;
        for (t = 0; t < tableno; t++)
          if (tables[t].usecount > 1) {
            p = 0;
            fprintf(f, "static const unsigned char %s_page%s[%d] = {\n", c_charsetname, tables[t].suffix, 8*(tables[t].maxline-tables[t].minline+1));
            for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
              if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
                fprintf(f, " /* 0x%04x */\n", 8*j1);
              fprintf(f, " ");
              for (j2 = 0; j2 < 8; j2++) {
                j = 8*j1+j2;
                fprintf(f, " 0x%02x,", uni2charset[j]);
              }
              fprintf(f, " /* 0x%02x-0x%02x */\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
            }
            fprintf(f, "};\n");
          }
        if (p >= 0)
          fprintf(f, "\n");
      }
      /* The local variable c is only needed if some range is neither
         unmapped nor the leading identity range. */
      need_c = false;
      for (j1 = 0; j1 < 0x2000;) {
        t = line[j1];
        for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
        if (t >= 0)
          j2 = tables[t].maxline+1;
        if (!(t == -2 || (t == -1 && j1 == 0)))
          need_c = true;
        j1 = j2;
      }
      fix_0000 = false;
      fprintf(f, "static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", c_charsetname);
      fprintf(f, "{\n");
      if (need_c)
        fprintf(f, " unsigned char c = 0;\n");
      for (j1 = 0; j1 < 0x2000;) {
        t = line[j1];
        for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
        if (t >= 0) {
          if (j1 != tables[t].minline) abort();
          if (j2 > tables[t].maxline+1) abort();
          j2 = tables[t].maxline+1;
        }
        if (t == -2) {
        } else {
          if (j1 == 0)
            fprintf(f, " ");
          else
            fprintf(f, " else ");
          if (t >= 0 && tables[t].usecount == 0) abort();
          if (t >= 0 && tables[t].usecount == 1) {
            /* A single mapping is emitted as a direct comparison. */
            if (j2 != j1+1) abort();
            for (j = 8*j1; j < 8*j2; j++)
              if (uni2charset[j] != 0) {
                fprintf(f, "if (wc == 0x%04x)\n c = 0x%02x;\n", j, uni2charset[j]);
                break;
              }
          } else {
            if (j1 == 0) {
              fprintf(f, "if (wc < 0x%04x)", 8*j2);
            } else {
              fprintf(f, "if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
            }
            if (t == -1) {
              if (j1 == 0)
                /* If wc == 0, the function must return 1, not -1. */
                fprintf(f, " {\n *r = wc;\n return 1;\n }\n");
              else
                fprintf(f, "\n c = wc;\n");
            } else {
              fprintf(f, "\n c = %s_page%s[wc", c_charsetname, tables[t].suffix);
              if (tables[t].minline > 0)
                fprintf(f, "-0x%04x", 8*j1);
              fprintf(f, "];\n");
              if (j1 == 0 && uni2charset[0] == 0)
                /* If wc == 0, the function must return 1, not -1. */
                fix_0000 = true;
            }
          }
        }
        j1 = j2;
      }
      if (need_c) {
        if (fix_0000)
          fprintf(f, " if (c != 0 || wc == 0) {\n");
        else
          fprintf(f, " if (c != 0) {\n");
        fprintf(f, " *r = c;\n");
        fprintf(f, " return 1;\n");
        fprintf(f, " }\n");
      }
      fprintf(f, " return RET_ILSEQ;\n");
      fprintf(f, "}\n");

    }

    if (ferror(f) || fclose(f))
      exit(1);
  }

  exit(0);
}

/*
 * diff --git a/doc/libNX_X11/lcUniConv/cjk_tab_to_h.c b/doc/libNX_X11/lcUniConv/cjk_tab_to_h.c
 * new file mode 100644
 * index 000000000..f70fe5e11
 * --- /dev/null
 * +++ b/doc/libNX_X11/lcUniConv/cjk_tab_to_h.c
 * @@ -0,0 +1,1071 @@
 *
 * Generates a CJK
 */
/*
 * character set table from a .TXT table as found on
 * ftp.unicode.org or in the X nls directory.
 * Examples:
 *
 *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < gb2312
 *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < jis0208
 *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < ksc5601
 *
 *   ./cjk_tab_to_h GB2312.1980-0 gb2312 > gb2312.h < GB2312.TXT
 *   ./cjk_tab_to_h JISX0208.1983-0 jisx0208 > jisx0208.h < JIS0208.TXT
 *   ./cjk_tab_to_h JISX0212.1990-0 jisx0212 > jisx0212.h < JIS0212.TXT
 *   ./cjk_tab_to_h KSC5601.1987-0 ksc5601 > ksc5601.h < KSC5601.TXT
 *   ./cjk_tab_to_h KSX1001.1992-0 ksc5601 > ksc5601.h < KSX1001.TXT
 *
 *   ./cjk_tab_to_h BIG5 big5 > big5.h < BIG5.TXT
 *
 *   ./cjk_tab_to_h JOHAB johab > johab.h < JOHAB.TXT
 *
 *   ./cjk_tab_to_h BIG5HKSCS-0 big5hkscs >big5hkscs.h < BIG5HKSCS.TXT
 */

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Half-open index range [start, end). */
typedef struct {
  int start;
  int end;
} Block;

typedef struct {
  int rows;                    /* number of possible values for the 1st byte */
  int cols;                    /* number of possible values for the 2nd byte */
  int (*row_byte) (int row);   /* returns the 1st byte value for a given row */
  int (*col_byte) (int col);   /* returns the 2nd byte value for a given col */
  int (*byte_row) (int byte);  /* converts a 1st byte value to a row, else -1 */
  int (*byte_col) (int byte);  /* converts a 2nd byte value to a col, else -1 */
  const char* check_row_expr;  /* format string for 1st byte value checking */
  const char* check_col_expr;  /* format string for 2nd byte value checking */
  const char* byte_row_expr;   /* format string for 1st byte value to row */
  const char* byte_col_expr;   /* format string for 2nd byte value to col */
  int** charset2uni;           /* charset2uni[0..rows-1][0..cols-1] is valid */
  /* You'll understand the terms "row" and "col" when you buy Ken Lunde's book.
     Once a row is fixed, choosing a "col" is the same as choosing a "cell". */
  int* charsetpage;            /* charsetpage[0..rows]: how large is a page for a row */
  int ncharsetblocks;
  Block* charsetblocks;        /* blocks[0..nblocks-1] */
  int* uni2charset;            /* uni2charset[0x0000..0xffff] */
} Encoding;

/* malloc() that terminates the program instead of returning NULL. */
static void* xmalloc (size_t size)
{
  void* p = malloc(size);
  if (p == NULL) {
    fprintf(stderr, "out of memory\n");
    exit(1);
  }
  return p;
}

/*
 * Outputs the file title.
 */
static void output_title (const char *charsetname)
{
  printf("\n");
  printf("/*\n");
  printf(" * %s\n", charsetname);
  printf(" */\n");
  printf("\n");
}

/*
 * Reads the charset2uni table from standard input.
 *
 * Allocates enc->charset2uni (rows x cols, preset to 0xfffd = unmapped) and
 * fills it from either a unicode.org style .TXT file (first character is
 * '#') or a plain table of hexadecimal Unicode values.  Exits with status 1
 * on malformed input, on an entry outside the rows x cols grid, or on a
 * Unicode value above 0xffff (the tables built from this data are indexed
 * by 16-bit code points).
 */
static void read_table (Encoding* enc)
{
  int row, col, i, i1, i2, c;
  unsigned int code, uc; /* %x requires unsigned int targets */

  enc->charset2uni = xmalloc(enc->rows*sizeof(int*));
  for (row = 0; row < enc->rows; row++)
    enc->charset2uni[row] = xmalloc(enc->cols*sizeof(int));

  for (row = 0; row < enc->rows; row++)
    for (col = 0; col < enc->cols; col++)
      enc->charset2uni[row][col] = 0xfffd;

  c = getc(stdin);
  ungetc(c,stdin);
  if (c == '#') {
    /* Read a unicode.org style .TXT file. */
    for (;;) {
      c = getc(stdin);
      if (c == EOF)
        break;
      if (c == '\n' || c == ' ' || c == '\t')
        continue;
      if (c == '#') {
        do { c = getc(stdin); } while (!(c == EOF || c == '\n'));
        continue;
      }
      ungetc(c,stdin);
      if (scanf("0x%x", &code) != 1)
        exit(1);
      i1 = (int) (code >> 8);
      i2 = (int) (code & 0xff);
      row = enc->byte_row(i1);
      col = enc->byte_col(i2);
      /* The byte_row/byte_col callbacks need not enforce an upper bound,
         so check both ends here before indexing. */
      if (row < 0 || row >= enc->rows || col < 0 || col >= enc->cols) {
        fprintf(stderr, "lost entry for %02x %02x\n", i1, i2);
        exit(1);
      }
      if (scanf(" 0x%x", &uc) != 1)
        exit(1);
      if (uc > 0xffff) {
        fprintf(stderr, "Unicode value 0x%x beyond 0xffff for %02x %02x\n", uc, i1, i2);
        exit(1);
      }
      enc->charset2uni[row][col] = (int) uc;
    }
  } else {
    /* Read a table of hexadecimal Unicode values. */
    for (i1 = 32; i1 < 132; i1++)
      for (i2 = 32; i2 < 132; i2++) {
        i = scanf("%x", &uc);
        if (i == EOF)
          goto read_done;
        if (i != 1)
          exit(1);
        /* Values that would be negative as an int, and 0xffff, mean
           "no mapping". */
        if (uc > (unsigned int) INT_MAX || uc == 0xffff)
          uc = 0xfffd;
        if (uc != 0xfffd) {
          row = enc->byte_row(i1);
          col = enc->byte_col(i2);
          if (row < 0 || row >= enc->rows || col < 0 || col >= enc->cols) {
            fprintf(stderr, "lost entry at %02x %02x\n", i1, i2);
            exit (1);
          }
          if (uc > 0xffff) {
            fprintf(stderr, "Unicode value 0x%x beyond 0xffff at %02x %02x\n", uc, i1, i2);
            exit(1);
          }
          enc->charset2uni[row][col] = (int) uc;
        }
      }
   read_done: ;
  }
}

/*
 * Computes the charsetpage[0..rows] array.
 * charsetpage[row] is one more than the last mapped column of the row, or 0
 * for an empty row.  The extra element charsetpage[rows] stays 0 and acts
 * as a sentinel for the "next row" look-ups elsewhere.
 */
static void find_charset2uni_pages (Encoding* enc)
{
  int row, col;

  enc->charsetpage = xmalloc((enc->rows+1)*sizeof(int));

  for (row = 0; row <= enc->rows; row++)
    enc->charsetpage[row] = 0;

  for (row = 0; row < enc->rows; row++) {
    int used = 0;
    for (col = 0; col < enc->cols; col++)
      if (enc->charset2uni[row][col] != 0xfffd)
        used = col+1;
    enc->charsetpage[row] = used;
  }
}

/*
 * Fills in ncharsetblocks and charsetblocks.
 * A block is a maximal run of consecutive non-empty rows; start and end are
 * linear indices (row * cols + col), with end just past the last used cell.
 * Requires find_charset2uni_pages() to have run.
 */
static void find_charset2uni_blocks (Encoding* enc)
{
  int n, row, lastrow;

  enc->charsetblocks = xmalloc(enc->rows*sizeof(Block));

  n = 0;
  for (row = 0; row < enc->rows; row++)
    if (enc->charsetpage[row] > 0 && (row == 0 || enc->charsetpage[row-1] == 0)) {
      /* Terminates at the charsetpage[rows] sentinel at the latest. */
      for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
      enc->charsetblocks[n].start = row * enc->cols;
      enc->charsetblocks[n].end = lastrow * enc->cols + enc->charsetpage[lastrow];
      n++;
    }
  enc->ncharsetblocks = n;
}

/*
 * Outputs the charset to unicode table and function.
 * Prints one <name>_2uni_pageXX array per block of non-empty rows, followed
 * by the <name>_mbtowc function that indexes them.
 */
static void output_charset2uni (const char* name, Encoding* enc)
{
  int row, col, lastrow, col_max, i, i1_min, i1_max;

  find_charset2uni_pages(enc);

  find_charset2uni_blocks(enc);

  for (row = 0; row < enc->rows; row++)
    if (enc->charsetpage[row] > 0) {
      if (row == 0 || enc->charsetpage[row-1] == 0) {
        /* Start a new block. */
        for (lastrow = row; enc->charsetpage[lastrow+1] > 0; lastrow++);
        printf("static const unsigned short %s_2uni_page%02x[%d] = {\n",
               name, enc->row_byte(row),
               (lastrow-row) * enc->cols + enc->charsetpage[lastrow]);
      }
      printf(" /""* 0x%02x *""/\n ", enc->row_byte(row));
      /* Only the last row of a block is truncated to its used length. */
      col_max = (enc->charsetpage[row+1] > 0 ? enc->cols : enc->charsetpage[row]);
      for (col = 0; col < col_max; col++) {
        printf(" 0x%04x,", enc->charset2uni[row][col]);
        if ((col % 8) == 7 && (col+1 < col_max)) printf("\n ");
      }
      printf("\n");
      if (enc->charsetpage[row+1] == 0) {
        /* End a block. */
        printf("};\n");
      }
    }
  printf("\n");

  printf("static int\n");
  printf("%s_mbtowc (conv_t conv, ucs4_t *pwc, const unsigned char *s, int n)\n", name);
  printf("{\n");
  printf(" unsigned char c1 = s[0];\n");
  printf(" if (");
  for (i = 0; i < enc->ncharsetblocks; i++) {
    i1_min = enc->row_byte(enc->charsetblocks[i].start / enc->cols);
    i1_max = enc->row_byte((enc->charsetblocks[i].end-1) / enc->cols);
    if (i > 0)
      printf(" || ");
    if (i1_min == i1_max)
      printf("(c1 == 0x%02x)", i1_min);
    else
      printf("(c1 >= 0x%02x && c1 <= 0x%02x)", i1_min, i1_max);
  }
  printf(") {\n");
  printf(" if (n >= 2) {\n");
  printf(" unsigned char c2 = s[1];\n");
  printf(" if (");
  /* The *_expr members are printf formats taking the variable name. */
  printf(enc->check_col_expr, "c2");
  printf(") {\n");
  printf(" unsigned int i = %d * (", enc->cols);
  printf(enc->byte_row_expr, "c1");
  printf(") + (");
  printf(enc->byte_col_expr, "c2");
  printf(");\n");
  printf(" unsigned short wc = 0xfffd;\n");
  for (i = 0; i < enc->ncharsetblocks; i++) {
    printf(" ");
    if (i > 0)
      printf("} else ");
    if (i < enc->ncharsetblocks-1)
      printf("if (i < %d) ", enc->charsetblocks[i+1].start);
    printf("{\n");
    printf(" if (i < %d)\n", enc->charsetblocks[i].end);
    printf(" wc = %s_2uni_page%02x[i", name, enc->row_byte(enc->charsetblocks[i].start / enc->cols));
    if (enc->charsetblocks[i].start > 0)
      printf("-%d", enc->charsetblocks[i].start);
    printf("];\n");
  }
  printf(" }\n");
  printf(" if (wc != 0xfffd) {\n");
  printf(" *pwc = (ucs4_t) wc;\n");
  printf(" return 2;\n");
  printf(" }\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOFEW(0);\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf("}\n");
  printf("\n");
}

/*
 * Computes the uni2charset[0x0000..0xffff] array.
 * Each mapped code point gets 0x100 * row_byte(row) + col_byte(col);
 * unmapped code points stay 0.  Exits with status 1 if charset2uni holds a
 * value outside 0..0xffff, since that would index past the array.
 */
static void invert (Encoding* enc)
{
  int row, col, j;

  enc->uni2charset = xmalloc(0x10000*sizeof(int));

  for (j = 0; j < 0x10000; j++)
    enc->uni2charset[j] = 0;

  for (row = 0; row < enc->rows; row++)
    for (col = 0; col < enc->cols; col++) {
      j = enc->charset2uni[row][col];
      if (j == 0xfffd)
        continue;
      if (j < 0 || j > 0xffff) {
        fprintf(stderr, "Unicode value 0x%x out of range at row %d col %d\n",
                (unsigned int) j, row, col);
        exit(1);
      }
      enc->uni2charset[j] = 0x100 * enc->row_byte(row) + enc->col_byte(col);
    }
}

/*
 * Outputs the unicode to charset table and function, using a linear array.
 * (Suitable if the table is dense.)
 * Requires invert() to have run.
 *
 * NOTE(review): tables are named <name>_pageXX after the page of their first
 * line only; two tables starting in the same 256-code-point page would get
 * the same identifier.  Confirm this cannot happen for the intended inputs.
 */
static void output_uni2charset_dense (const char* name, Encoding* enc)
{
  /* Like in 8bit_tab_to_h.c */
  /* line[j1] covers the 8 code points 8*j1 .. 8*j1+7:
     -1 = all unmapped, >= 0 = index of the table containing the line. */
  int line[0x2000];
  int tableno;
  struct { int minline; int maxline; int usecount; } tables[0x2000];
  bool first;
  int j, p, j1, j2, t;

  for (j1 = 0; j1 < 0x2000; j1++) {
    bool all_invalid = true;
    for (j2 = 0; j2 < 8; j2++) {
      j = 8*j1+j2;
      if (enc->uni2charset[j] != 0)
        all_invalid = false;
    }
    if (all_invalid)
      line[j1] = -1;
    else
      line[j1] = 0;
  }
  /* Merge lines into tables.  A line joins the previous table when it is
     adjacent to it, or lies in the same page at most 8 lines further on. */
  tableno = 0;
  for (j1 = 0; j1 < 0x2000; j1++) {
    if (line[j1] >= 0) {
      if (tableno > 0
          && ((j1 > 0 && line[j1-1] == tableno-1)
              || ((tables[tableno-1].maxline >> 5) == (j1 >> 5)
                  && j1 - tables[tableno-1].maxline <= 8))) {
        line[j1] = tableno-1;
        tables[tableno-1].maxline = j1;
      } else {
        tableno++;
        line[j1] = tableno-1;
        tables[tableno-1].minline = tables[tableno-1].maxline = j1;
      }
    }
  }
  for (t = 0; t < tableno; t++) {
    tables[t].usecount = 0;
    j1 = 8*tables[t].minline;
    j2 = 8*(tables[t].maxline+1);
    for (j = j1; j < j2; j++)
      if (enc->uni2charset[j] != 0)
        tables[t].usecount++;
  }
  {
    p = -1;
    for (t = 0; t < tableno; t++)
      if (tables[t].usecount > 1) {
        p = tables[t].minline >> 5;
        printf("static const unsigned short %s_page%02x[%d] = {\n", name, p, 8*(tables[t].maxline-tables[t].minline+1));
        for (j1 = tables[t].minline; j1 <= tables[t].maxline; j1++) {
          if ((j1 % 0x20) == 0 && j1 > tables[t].minline)
            printf(" /* 0x%04x */\n", 8*j1);
          printf(" ");
          for (j2 = 0; j2 < 8; j2++) {
            j = 8*j1+j2;
            printf(" 0x%04x,", enc->uni2charset[j]);
          }
          printf(" /*0x%02x-0x%02x*/\n", 8*(j1 % 0x20), 8*(j1 % 0x20)+7);
        }
        printf("};\n");
      }
    if (p >= 0)
      printf("\n");
  }
  printf("static int\n%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
  printf("{\n");
  printf(" if (n >= 2) {\n");
  printf(" unsigned short c = 0;\n");
  first = true;
  for (j1 = 0; j1 < 0x2000;) {
    t = line[j1];
    for (j2 = j1; j2 < 0x2000 && line[j2] == t; j2++);
    if (t >= 0) {
      if (j1 != tables[t].minline) abort();
      if (j2 > tables[t].maxline+1) abort();
      j2 = tables[t].maxline+1;
      if (first)
        printf(" ");
      else
        printf(" else ");
      first = false;
      if (tables[t].usecount == 0) abort();
      if (tables[t].usecount == 1) {
        /* A single mapping is emitted as a direct comparison. */
        if (j2 != j1+1) abort();
        for (j = 8*j1; j < 8*j2; j++)
          if (enc->uni2charset[j] != 0) {
            printf("if (wc == 0x%04x)\n c = 0x%02x;\n", j, enc->uni2charset[j]);
            break;
          }
      } else {
        if (j1 == 0) {
          printf("if (wc < 0x%04x)", 8*j2);
        } else {
          printf("if (wc >= 0x%04x && wc < 0x%04x)", 8*j1, 8*j2);
        }
        printf("\n c = %s_page%02x[wc", name, j1 >> 5);
        if (tables[t].minline > 0)
          printf("-0x%04x", 8*j1);
        printf("];\n");
      }
    }
    j1 = j2;
  }
  printf(" if (c != 0) {\n");
  printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
  printf(" return 2;\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOSMALL;\n");
  printf("}\n");
}

/*
 * Outputs the unicode to charset table and function, using a packed array.
 * (Suitable if the table is sparse.)
 * Requires invert() to have run.  Emits <name>_2charset (all mapped values
 * in code point order), one Summary16 array per run of used pages, and the
 * <name>_wctomb function that combines them.
 */
static void output_uni2charset_sparse (const char* name, Encoding* enc)
{
  bool pages[0x100];
  Block pageblocks[0x100]; int npageblocks;
  int indx2charset[0x10000];
  int summary_indx[0x1000];
  int summary_used[0x1000];
  int i, row, col, j, p, j1, j2, indx;

  /* Fill pages[0x100]. */
  for (p = 0; p < 0x100; p++)
    pages[p] = false;
  for (row = 0; row < enc->rows; row++)
    for (col = 0; col < enc->cols; col++) {
      j = enc->charset2uni[row][col];
      if (j != 0xfffd)
        pages[j>>8] = true;
    }

  /* Fill summary_indx[] and summary_used[]. */
  /* Entry j1 describes the 16 code points 16*j1 .. 16*j1+15: a bitmask of
     the mapped ones and the index of the first of them in indx2charset[]. */
  indx = 0;
  for (j1 = 0; j1 < 0x1000; j1++) {
    summary_indx[j1] = indx;
    summary_used[j1] = 0;
    for (j2 = 0; j2 < 16; j2++) {
      j = 16*j1+j2;
      if (enc->uni2charset[j] != 0) {
        indx2charset[indx++] = enc->uni2charset[j];
        summary_used[j1] |= (1 << j2);
      }
    }
  }

  /* Fill npageblocks and pageblocks[]. */
  /* start and end are indices into the summary arrays (16 per page). */
  npageblocks = 0;
  for (p = 0; p < 0x100; ) {
    if (pages[p] && (p == 0 || !pages[p-1])) {
      pageblocks[npageblocks].start = 16*p;
      do p++; while (p < 0x100 && pages[p]);
      j1 = 16*p;
      /* Drop trailing empty summary entries, never past the block start. */
      while (j1 > pageblocks[npageblocks].start && summary_used[j1-1] == 0) j1--;
      pageblocks[npageblocks].end = j1;
      npageblocks++;
    } else
      p++;
  }

  printf("static const unsigned short %s_2charset[%d] = {\n", name, indx);
  for (i = 0; i < indx; ) {
    if ((i % 8) == 0) printf(" ");
    printf(" 0x%04x,", indx2charset[i]);
    i++;
    if ((i % 8) == 0 || i == indx) printf("\n");
  }
  printf("};\n");
  printf("\n");
  for (i = 0; i < npageblocks; i++) {
    printf("static const Summary16 %s_uni2indx_page%02x[%d] = {\n", name,
           pageblocks[i].start/16, pageblocks[i].end-pageblocks[i].start);
    for (j1 = pageblocks[i].start; j1 < pageblocks[i].end; ) {
      if (((16*j1) % 0x100) == 0) printf(" /""* 0x%04x *""/\n", 16*j1);
      if ((j1 % 4) == 0) printf(" ");
      printf(" { %4d, 0x%04x },", summary_indx[j1], summary_used[j1]);
      j1++;
      if ((j1 % 4) == 0 || j1 == pageblocks[i].end) printf("\n");
    }
    printf("};\n");
  }
  printf("\n");

  printf("static int\n");
  printf("%s_wctomb (conv_t conv, unsigned char *r, ucs4_t wc, int n)\n", name);
  printf("{\n");
  printf(" if (n >= 2) {\n");
  printf(" const Summary16 *summary = NULL;\n");
  for (i = 0; i < npageblocks; i++) {
    printf(" ");
    if (i > 0)
      printf("else ");
    printf("if (wc >= 0x%04x && wc < 0x%04x)\n",
           16*pageblocks[i].start, 16*pageblocks[i].end);
    printf(" summary = &%s_uni2indx_page%02x[(wc>>4)", name,
           pageblocks[i].start/16);
    if (pageblocks[i].start > 0)
      printf("-0x%03x", pageblocks[i].start);
    printf("];\n");
  }
  printf(" if (summary) {\n");
  printf(" unsigned short used = summary->used;\n");
  printf(" unsigned int i = wc & 0x0f;\n");
  printf(" if (used & ((unsigned short) 1 << i)) {\n");
  printf(" unsigned short c;\n");
  printf(" /* Keep in `used' only the bits 0..i-1. */\n");
  printf(" used &= ((unsigned short) 1 << i) - 1;\n");
  printf(" /* Add `summary->indx' and the number of bits set in `used'. */\n");
  printf(" used = (used & 0x5555) + ((used & 0xaaaa) >> 1);\n");
  printf(" used = (used & 0x3333) + ((used & 0xcccc) >> 2);\n");
  printf(" used = (used & 0x0f0f) + ((used & 0xf0f0) >> 4);\n");
  printf(" used = (used & 0x00ff) + (used >> 8);\n");
  printf(" c = %s_2charset[summary->indx + used];\n", name);
  printf(" r[0] = (c >> 8); r[1] = (c & 0xff);\n");
  printf(" return 2;\n");
  printf(" }\n");
  printf(" }\n");
  printf(" return RET_ILSEQ;\n");
  printf(" }\n");
  printf(" return RET_TOOSMALL;\n");
  printf("}\n");
}

/* ISO-2022/EUC specifics */

/* Rows and columns map linearly onto byte values starting at 0x21.  The
   byte_* functions do no range checking; read_table() does it. */
static int row_byte_normal (int row) { return 0x21+row; }
static int col_byte_normal (int col) { return 0x21+col; }
static int byte_row_normal (int byte) { return byte-0x21; }
static int byte_col_normal (int byte) { return byte-0x21; }

/* Reads a 94x94 table from stdin and prints both conversion directions,
   using the sparse layout for unicode to charset. */
static void do_normal (const char* name)
{
  Encoding enc;

  enc.rows = 94;
  enc.cols = 94;
  enc.row_byte = row_byte_normal;
  enc.col_byte = col_byte_normal;
  enc.byte_row = byte_row_normal;
  enc.byte_col = byte_col_normal;
  enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f";
  enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f";
  enc.byte_row_expr = "%1$s - 0x21";
  enc.byte_col_expr = "%1$s - 0x21";

  read_table(&enc);
  output_charset2uni(name,&enc);
  invert(&enc); output_uni2charset_sparse(name,&enc);
}

/* Note: On first sight, the jisx0212_2charset[] table seems to be in order,
   starting from the charset=0x3021/uni=0x4e02 pair. But it's only mostly in
   order. There are 75 out-of-order values, scattered all throughout the table.
 */
+ */ + +static void do_normal_only_charset2uni (const char* name) +{ + Encoding enc; + + enc.rows = 94; + enc.cols = 94; + enc.row_byte = row_byte_normal; + enc.col_byte = col_byte_normal; + enc.byte_row = byte_row_normal; + enc.byte_col = byte_col_normal; + enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f"; + enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f"; + enc.byte_row_expr = "%1$s - 0x21"; + enc.byte_col_expr = "%1$s - 0x21"; + + read_table(&enc); + output_charset2uni(name,&enc); +} + +/* CNS 11643 specifics - trick to put two tables into one */ + +static int row_byte_cns11643 (int row) { + return 0x100 * (row / 94) + (row % 94) + 0x21; +} +static int byte_row_cns11643 (int byte) { + return (byte >= 0x100 && byte < 0x200 ? byte-0x121 : + byte >= 0x200 && byte < 0x300 ? byte-0x221+94 : + byte >= 0x300 && byte < 0x400 ? byte-0x321+2*94 : + -1); +} + +static void do_cns11643_only_uni2charset (const char* name) +{ + Encoding enc; + int j, x; + + enc.rows = 3*94; + enc.cols = 94; + enc.row_byte = row_byte_cns11643; + enc.col_byte = col_byte_normal; + enc.byte_row = byte_row_cns11643; + enc.byte_col = byte_col_normal; + enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f"; + enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f"; + enc.byte_row_expr = "%1$s - 0x21"; + enc.byte_col_expr = "%1$s - 0x21"; + + read_table(&enc); + invert(&enc); + /* Move the 2 plane bits into the unused bits 15 and 7. */ + for (j = 0; j < 0x10000; j++) { + x = enc.uni2charset[j]; + if (x != 0) { + if (x & 0x8080) abort(); + switch (x >> 16) { + case 0: /* plane 1 */ x = (x & 0xffff) | 0x0000; break; + case 1: /* plane 2 */ x = (x & 0xffff) | 0x0080; break; + case 2: /* plane 3 */ x = (x & 0xffff) | 0x8000; break; + default: abort(); + } + enc.uni2charset[j] = x; + } + } + output_uni2charset_sparse(name,&enc); +} + +/* GBK specifics */ + +static int row_byte_gbk1 (int row) { + return 0x81+row; +} +static int col_byte_gbk1 (int col) { + return (col >= 0x3f ? 
0x41 : 0x40) + col; +} +static int byte_row_gbk1 (int byte) { + if (byte >= 0x81 && byte < 0xff) + return byte-0x81; + else + return -1; +} +static int byte_col_gbk1 (int byte) { + if (byte >= 0x40 && byte < 0x7f) + return byte-0x40; + else if (byte >= 0x80 && byte < 0xff) + return byte-0x41; + else + return -1; +} + +static void do_gbk1 (const char* name) +{ + Encoding enc; + + enc.rows = 126; + enc.cols = 190; + enc.row_byte = row_byte_gbk1; + enc.col_byte = col_byte_gbk1; + enc.byte_row = byte_row_gbk1; + enc.byte_col = byte_col_gbk1; + enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff"; + enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)"; + enc.byte_row_expr = "%1$s - 0x81"; + enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)"; + + read_table(&enc); + output_charset2uni(name,&enc); + invert(&enc); output_uni2charset_dense(name,&enc); +} + +static void do_gbk1_only_charset2uni (const char* name) +{ + Encoding enc; + + enc.rows = 126; + enc.cols = 190; + enc.row_byte = row_byte_gbk1; + enc.col_byte = col_byte_gbk1; + enc.byte_row = byte_row_gbk1; + enc.byte_col = byte_col_gbk1; + enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff"; + enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)"; + enc.byte_row_expr = "%1$s - 0x81"; + enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)"; + + read_table(&enc); + output_charset2uni(name,&enc); +} + +static int row_byte_gbk2 (int row) { + return 0x81+row; +} +static int col_byte_gbk2 (int col) { + return (col >= 0x3f ? 
0x41 : 0x40) + col; +} +static int byte_row_gbk2 (int byte) { + if (byte >= 0x81 && byte < 0xff) + return byte-0x81; + else + return -1; +} +static int byte_col_gbk2 (int byte) { + if (byte >= 0x40 && byte < 0x7f) + return byte-0x40; + else if (byte >= 0x80 && byte < 0xa1) + return byte-0x41; + else + return -1; +} + +static void do_gbk2_only_charset2uni (const char* name) +{ + Encoding enc; + + enc.rows = 126; + enc.cols = 96; + enc.row_byte = row_byte_gbk2; + enc.col_byte = col_byte_gbk2; + enc.byte_row = byte_row_gbk2; + enc.byte_col = byte_col_gbk2; + enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff"; + enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xa1)"; + enc.byte_row_expr = "%1$s - 0x81"; + enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)"; + + read_table(&enc); + output_charset2uni(name,&enc); +} + +static void do_gbk1_only_uni2charset (const char* name) +{ + Encoding enc; + + enc.rows = 126; + enc.cols = 190; + enc.row_byte = row_byte_gbk1; + enc.col_byte = col_byte_gbk1; + enc.byte_row = byte_row_gbk1; + enc.byte_col = byte_col_gbk1; + enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff"; + enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xff)"; + enc.byte_row_expr = "%1$s - 0x81"; + enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)"; + + read_table(&enc); + invert(&enc); output_uni2charset_sparse(name,&enc); +} + +/* KSC 5601 specifics */ + +/* + * Reads the charset2uni table from standard input. + */ +static void read_table_ksc5601 (Encoding* enc) +{ + int row, col, i, i1, i2, c, j; + + enc->charset2uni = malloc(enc->rows*sizeof(int*)); + for (row = 0; row < enc->rows; row++) + enc->charset2uni[row] = malloc(enc->cols*sizeof(int)); + + for (row = 0; row < enc->rows; row++) + for (col = 0; col < enc->cols; col++) + enc->charset2uni[row][col] = 0xfffd; + + c = getc(stdin); + ungetc(c,stdin); + if (c == '#') { + /* Read a unicode.org style .TXT file. 
*/ + for (;;) { + c = getc(stdin); + if (c == EOF) + break; + if (c == '\n' || c == ' ' || c == '\t') + continue; + if (c == '#') { + do { c = getc(stdin); } while (!(c == EOF || c == '\n')); + continue; + } + ungetc(c,stdin); + if (scanf("0x%x", &j) != 1) + exit(1); + i1 = j >> 8; + i2 = j & 0xff; + if (scanf(" 0x%x", &j) != 1) + exit(1); + /* Take only the range covered by KS C 5601.1987-0 = KS C 5601.1989-0 + = KS X 1001.1992, ignore the rest. */ + if (!(i1 >= 128+33 && i1 < 128+127 && i2 >= 128+33 && i2 < 128+127)) + continue; /* KSC5601 specific */ + i1 &= 0x7f; /* KSC5601 specific */ + i2 &= 0x7f; /* KSC5601 specific */ + row = enc->byte_row(i1); + col = enc->byte_col(i2); + if (row < 0 || col < 0) { + fprintf(stderr, "lost entry for %02x %02x\n", i1, i2); + exit(1); + } + enc->charset2uni[row][col] = j; + } + } else { + /* Read a table of hexadecimal Unicode values. */ + for (i1 = 33; i1 < 127; i1++) + for (i2 = 33; i2 < 127; i2++) { + i = scanf("%x", &j); + if (i == EOF) + goto read_done; + if (i != 1) + exit(1); + if (j < 0 || j == 0xffff) + j = 0xfffd; + if (j != 0xfffd) { + if (enc->byte_row(i1) < 0 || enc->byte_col(i2) < 0) { + fprintf(stderr, "lost entry at %02x %02x\n", i1, i2); + exit (1); + } + enc->charset2uni[enc->byte_row(i1)][enc->byte_col(i2)] = j; + } + } + read_done: ; + } +} + +static void do_ksc5601 (const char* name) +{ + Encoding enc; + + enc.rows = 94; + enc.cols = 94; + enc.row_byte = row_byte_normal; + enc.col_byte = col_byte_normal; + enc.byte_row = byte_row_normal; + enc.byte_col = byte_col_normal; + enc.check_row_expr = "%1$s >= 0x21 && %1$s < 0x7f"; + enc.check_col_expr = "%1$s >= 0x21 && %1$s < 0x7f"; + enc.byte_row_expr = "%1$s - 0x21"; + enc.byte_col_expr = "%1$s - 0x21"; + + read_table_ksc5601(&enc); + output_charset2uni(name,&enc); + invert(&enc); output_uni2charset_sparse(name,&enc); +} + +/* Big5 specifics */ + +static int row_byte_big5 (int row) { + return 0xa1+row; +} +static int col_byte_big5 (int col) { + return (col >= 
0x3f ? 0x62 : 0x40) + col; +} +static int byte_row_big5 (int byte) { + if (byte >= 0xa1 && byte < 0xff) + return byte-0xa1; + else + return -1; +} +static int byte_col_big5 (int byte) { + if (byte >= 0x40 && byte < 0x7f) + return byte-0x40; + else if (byte >= 0xa1 && byte < 0xff) + return byte-0x62; + else + return -1; +} + +static void do_big5 (const char* name) +{ + Encoding enc; + + enc.rows = 94; + enc.cols = 157; + enc.row_byte = row_byte_big5; + enc.col_byte = col_byte_big5; + enc.byte_row = byte_row_big5; + enc.byte_col = byte_col_big5; + enc.check_row_expr = "%1$s >= 0xa1 && %1$s < 0xff"; + enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)"; + enc.byte_row_expr = "%1$s - 0xa1"; + enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 0x62 : 0x40)"; + + read_table(&enc); + output_charset2uni(name,&enc); + invert(&enc); output_uni2charset_sparse(name,&enc); +} + +/* Big5-HKSCS specifics */ + +static int row_byte_big5hkscs (int row) { + return 0x81+row; +} +static int col_byte_big5hkscs (int col) { + return (col >= 0x3f ? 0x62 : 0x40) + col; +} +static int byte_row_big5hkscs (int byte) { + if (byte >= 0x81 && byte < 0xff) + return byte-0x81; + else + return -1; +} +static int byte_col_big5hkscs (int byte) { + if (byte >= 0x40 && byte < 0x7f) + return byte-0x40; + else if (byte >= 0xa1 && byte < 0xff) + return byte-0x62; + else + return -1; +} + +static void do_big5hkscs (const char* name) +{ + Encoding enc; + + enc.rows = 126; + enc.cols = 157; + enc.row_byte = row_byte_big5hkscs; + enc.col_byte = col_byte_big5hkscs; + enc.byte_row = byte_row_big5hkscs; + enc.byte_col = byte_col_big5hkscs; + enc.check_row_expr = "%1$s >= 0x81 && %1$s < 0xff"; + enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0xa1 && %1$s < 0xff)"; + enc.byte_row_expr = "%1$s - 0x81"; + enc.byte_col_expr = "%1$s - (%1$s >= 0xa1 ? 
0x62 : 0x40)"; + + read_table(&enc); + output_charset2uni(name,&enc); + invert(&enc); output_uni2charset_sparse(name,&enc); +} + +/* Johab Hangul specifics */ + +static int row_byte_johab_hangul (int row) { + return 0x84+row; +} +static int col_byte_johab_hangul (int col) { + return (col >= 0x3e ? 0x43 : 0x41) + col; +} +static int byte_row_johab_hangul (int byte) { + if (byte >= 0x84 && byte < 0xd4) + return byte-0x84; + else + return -1; +} +static int byte_col_johab_hangul (int byte) { + if (byte >= 0x41 && byte < 0x7f) + return byte-0x41; + else if (byte >= 0x81 && byte < 0xff) + return byte-0x43; + else + return -1; +} + +static void do_johab_hangul (const char* name) +{ + Encoding enc; + + enc.rows = 80; + enc.cols = 188; + enc.row_byte = row_byte_johab_hangul; + enc.col_byte = col_byte_johab_hangul; + enc.byte_row = byte_row_johab_hangul; + enc.byte_col = byte_col_johab_hangul; + enc.check_row_expr = "%1$s >= 0x84 && %1$s < 0xd4"; + enc.check_col_expr = "(%1$s >= 0x41 && %1$s < 0x7f) || (%1$s >= 0x81 && %1$s < 0xff)"; + enc.byte_row_expr = "%1$s - 0x84"; + enc.byte_col_expr = "%1$s - (%1$s >= 0x81 ? 0x43 : 0x41)"; + + read_table(&enc); + output_charset2uni(name,&enc); + invert(&enc); output_uni2charset_dense(name,&enc); +} + +/* SJIS specifics */ + +static int row_byte_sjis (int row) { + return (row >= 0x1f ? 0xc1 : 0x81) + row; +} +static int col_byte_sjis (int col) { + return (col >= 0x3f ? 
0x41 : 0x40) + col; +} +static int byte_row_sjis (int byte) { + if (byte >= 0x81 && byte < 0xa0) + return byte-0x81; + else if (byte >= 0xe0) + return byte-0xc1; + else + return -1; +} +static int byte_col_sjis (int byte) { + if (byte >= 0x40 && byte < 0x7f) + return byte-0x40; + else if (byte >= 0x80 && byte < 0xfd) + return byte-0x41; + else + return -1; +} + +static void do_sjis (const char* name) +{ + Encoding enc; + + enc.rows = 94; + enc.cols = 188; + enc.row_byte = row_byte_sjis; + enc.col_byte = col_byte_sjis; + enc.byte_row = byte_row_sjis; + enc.byte_col = byte_col_sjis; + enc.check_row_expr = "(%1$s >= 0x81 && %1$s < 0xa0) || (%1$s >= 0xe0)"; + enc.check_col_expr = "(%1$s >= 0x40 && %1$s < 0x7f) || (%1$s >= 0x80 && %1$s < 0xfd)"; + enc.byte_row_expr = "%1$s - (%1$s >= 0xe0 ? 0xc1 : 0x81)"; + enc.byte_col_expr = "%1$s - (%1$s >= 0x80 ? 0x41 : 0x40)"; + + read_table(&enc); + output_charset2uni(name,&enc); + invert(&enc); output_uni2charset_sparse(name,&enc); +} + +/* Main program */ + +int main (int argc, char *argv[]) +{ + const char* charsetname; + const char* name; + + if (argc != 3) + exit(1); + charsetname = argv[1]; + name = argv[2]; + + output_title(charsetname); + + if (!strcmp(name,"gb2312") || !strcmp(name,"gb12345ext") + || !strcmp(name,"jisx0208") || !strcmp(name,"jisx0212")) + do_normal(name); + else if (!strcmp(name,"cns11643_1") || !strcmp(name,"cns11643_2") + || !strcmp(name,"cns11643_3")) + do_normal_only_charset2uni(name); + else if (!strcmp(name,"cns11643_inv")) + do_cns11643_only_uni2charset(name); + else if (!strcmp(name,"gbkext1")) + do_gbk1_only_charset2uni(name); + else if (!strcmp(name,"gbkext2")) + do_gbk2_only_charset2uni(name); + else if (!strcmp(name,"gbkext_inv")) + do_gbk1_only_uni2charset(name); + else if (!strcmp(name,"cp936ext")) + do_gbk1(name); + else if (!strcmp(name,"ksc5601")) + do_ksc5601(name); + else if (!strcmp(name,"big5") || !strcmp(name,"cp950ext")) + do_big5(name); + else if (!strcmp(name,"big5hkscs")) + 
do_big5hkscs(name); + else if (!strcmp(name,"johab_hangul")) + do_johab_hangul(name); + else if (!strcmp(name,"cp932ext")) + do_sjis(name); + else + exit(1); + + return 0; +} -- cgit v1.2.3