diff options
Diffstat (limited to 'libiconv/iconv.c')
-rw-r--r-- | libiconv/iconv.c | 440 |
1 files changed, 440 insertions, 0 deletions
diff --git a/libiconv/iconv.c b/libiconv/iconv.c new file mode 100644 index 000000000..cb4e94775 --- /dev/null +++ b/libiconv/iconv.c @@ -0,0 +1,440 @@ +#include <iconv.h> +#include <errno.h> +#include <wchar.h> +#include <string.h> +#include <strings.h> +#include <stdlib.h> +#include <limits.h> + +#include <dirent.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <unistd.h> +#include <stdint.h> + +/* builtin charmaps */ +#include "charmaps.h" + +/* only 0-7 are valid as dest charset */ +#define UTF_16BE 000 +#define UTF_16LE 001 +#define UTF_32BE 002 +#define UTF_32LE 003 +#define WCHAR_T 004 +#define UTF_8 005 +#define US_ASCII 006 +#define LATIN_1 007 + +/* additional charsets with algorithmic conversion */ +#define LATIN_9 010 +#define TIS_620 011 +#define JIS_0201 012 + +/* some programs like php need this */ +int _libiconv_version = _LIBICONV_VERSION; + +/* these must match the constants above */ +static const unsigned char charsets[] = + "\005" "UTF-8" "\0" + "\004" "WCHAR_T" "\0" + "\000" "UTF-16BE" "\0" + "\001" "UTF-16LE" "\0" + "\002" "UTF-32BE" "\0" + "\003" "UTF-32LE" "\0" + "\006" "ASCII" "\0" + "\006" "US-ASCII" "\0" + "\007" "ISO-8859-1" "\0" + "\007" "LATIN1" "\0" + "\010" "ISO-8859-15""\0" + "\010" "LATIN9" "\0" + "\011" "ISO-8859-11""\0" + "\011" "TIS-620" "\0" + "\012" "JIS-0201" "\0" + "\377"; + +/* separate identifiers for sbcs/dbcs/etc map type */ +#define UCS2_8BIT 000 +#define UCS3_8BIT 001 +#define EUC 002 +#define EUC_TW 003 +#define SHIFT_JIS 004 +#define BIG5 005 +#define GBK 006 + +/* FIXME: these are not implemented yet +// EUC: A1-FE A1-FE +// GBK: 81-FE 40-7E,80-FE +// Big5: A1-FE 40-7E,A1-FE +*/ + +static const unsigned short maplen[] = { + [UCS2_8BIT] = 4+ 2* 128, + [UCS3_8BIT] = 4+ 3* 128, + [EUC] = 4+ 2* 94*94, + [SHIFT_JIS] = 4+ 2* 94*94, + [BIG5] = 4+ 2* 94*157, + [GBK] = 4+ 2* 126*190, + [EUC_TW] = 4+ 2* 2*94*94, +}; + +static int find_charmap(const char *name) +{ + int i; + for (i = 0; i < (sizeof(charmaps) / sizeof(charmaps[0])); i++) + if (!strcasecmp(charmaps[i].name, name)) + return i; + return -1; +} + +static int find_charset(const char *name) +{ + const unsigned char *s; + for (s=charsets; *s<0xff && strcasecmp(s+1, name); s+=strlen(s)+1); + return *s; +} + +iconv_t iconv_open(const char *to, const char *from) +{ + unsigned f, t; + int m; + + if ((t = find_charset(to)) > 8) + return -1; + + if ((f = find_charset(from)) < 255) + return 0 | (t<<1) | (f<<8); + + if ((m = find_charmap(from)) > -1) + return 1 | (t<<1) | (m<<8); + + return -1; +} + +int iconv_close(iconv_t cd) +{ + return 0; +} + +static inline wchar_t get_16(const unsigned char *s, int endian) +{ + endian &= 1; + return s[endian]<<8 | s[endian^1]; +} + +static inline void put_16(unsigned char *s, wchar_t c, int endian) +{ + endian &= 1; + s[endian] = c>>8; + s[endian^1] = c; +} + +static inline int utf8enc_wchar(char *outb, wchar_t c) +{ + if (c <= 0x7F) { + *outb = c; + return 1; + } + else if (c <= 0x7FF) { + *outb++ = ((c >> 6) & 0x1F) | 0xC0; + *outb++ = ( c & 0x3F) | 0x80; + return 2; + } + else if (c <= 0xFFFF) { + *outb++ = ((c >> 12) & 0x0F) | 0xE0; + *outb++ = ((c >> 6) & 0x3F) | 0x80; + *outb++ = ( c & 0x3F) | 0x80; + return 3; + } + else if (c <= 0x10FFFF) { + *outb++ = ((c >> 18) & 0x07) | 0xF0; + *outb++ = ((c >> 12) & 0x3F) | 0x80; + *outb++ = ((c >> 6) & 0x3F) | 0x80; + *outb++ = ( c & 0x3F) | 0x80; + return 4; + } + else { + *outb++ = '?'; + return 1; + } +} + +static inline int utf8seq_is_overlong(char *s, int n) +{ + switch (n) + { + case 2: + /* 1100000x (10xxxxxx) */ + return (((*s >> 1) == 0x60) && + ((*(s+1) >> 6) == 0x02)); + + case 3: + /* 11100000 100xxxxx (10xxxxxx) */ + return ((*s == 0xE0) && + ((*(s+1) >> 5) == 0x04) && + ((*(s+2) >> 6) == 0x02)); + + case 4: + /* 11110000 1000xxxx (10xxxxxx 10xxxxxx) */ + return ((*s == 0xF0) && + ((*(s+1) >> 4) == 0x08) && + ((*(s+2) >> 6) == 0x02) && + ((*(s+3) >> 6) == 0x02)); + } + + return 0; +} + +static inline int utf8seq_is_surrogate(char *s, int n) +{ + return ((n == 3) && (*s == 0xED) && (*(s+1) >= 0xA0) && (*(s+1) <= 0xBF)); +} + +static inline int utf8seq_is_illegal(char *s, int n) +{ + return ((n == 3) && (*s == 0xEF) && (*(s+1) == 0xBF) && + (*(s+2) >= 0xBE) && (*(s+2) <= 0xBF)); +} + +static inline int utf8dec_wchar(wchar_t *c, unsigned char *in, size_t inb) +{ + int i; + int n = -1; + + /* trivial char */ + if (*in <= 0x7F) { + *c = *in; + return 1; + } + + /* find utf8 sequence length */ + if ((*in & 0xE0) == 0xC0) n = 2; + else if ((*in & 0xF0) == 0xE0) n = 3; + else if ((*in & 0xF8) == 0xF0) n = 4; + else if ((*in & 0xFC) == 0xF8) n = 5; + else if ((*in & 0xFE) == 0xFC) n = 6; + + /* starved? */ + if (n > inb) + return -2; + + /* decode ... */ + if (n > 1 && n < 5) { + /* reject invalid sequences */ + if (utf8seq_is_overlong(in, n) || + utf8seq_is_surrogate(in, n) || + utf8seq_is_illegal(in, n)) + return -1; + + /* decode ... */ + *c = (char)(*in++ & (0x7F >> n)); + + for (i = 1; i < n; i++) { + /* illegal continuation byte */ + if (*in < 0x80 || *in > 0xBF) + return -1; + + *c = (*c << 6) | (*in++ & 0x3F); + } + + return n; + } + + /* unmapped sequence (> 4) */ + return -1; +} + +static inline char latin9_translit(wchar_t c) +{ + /* a number of trivial iso-8859-15 <> utf-8 transliterations */ + switch (c) { + case 0x20AC: return 0xA4; /* Euro */ + case 0x0160: return 0xA6; /* S caron */ + case 0x0161: return 0xA8; /* s caron */ + case 0x017D: return 0xB4; /* Z caron */ + case 0x017E: return 0xB8; /* z caron */ + case 0x0152: return 0xBC; /* OE */ + case 0x0153: return 0xBD; /* oe */ + case 0x0178: return 0xBE; /* Y diaeresis */ + default: return '?'; + } +} + +size_t iconv(iconv_t cd, char **in, size_t *inb, char **out, size_t *outb) +{ + size_t x=0; + unsigned char to = (cd>>1)&127; + unsigned char from = 255; + const unsigned char *map = 0; + char tmp[MB_LEN_MAX]; + wchar_t c, d; + size_t k, l; + int err; + + if (!in || !*in || !*inb) return 0; + + if (cd & 1) + map = charmaps[cd>>8].map; + else + from = cd>>8; + + for (; *inb; *in+=l, *inb-=l) { + c = *(unsigned char *)*in; + l = 1; + if (from >= UTF_8 && c < 0x80) goto charok; + switch (from) { + case WCHAR_T: + l = sizeof(wchar_t); + if (*inb < l) goto starved; + c = *(wchar_t *)*in; + break; + case UTF_8: + l = utf8dec_wchar(&c, *in, *inb); + if (!l) l++; + else if (l == (size_t)-1) goto ilseq; + else if (l == (size_t)-2) goto starved; + break; + case US_ASCII: + goto ilseq; + case LATIN_9: + if ((unsigned)c - 0xa4 <= 0xbe - 0xa4) { + static const unsigned char map[] = { + 0, 0x60, 0, 0x61, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0x7d, 0, 0, 0, 0x7e, 0, 0, 0, + 0x52, 0x53, 0x78 + }; + if (c == 0xa4) c = 0x20ac; + else if (map[c-0xa5]) c = 0x100 | map[c-0xa5]; + } + case LATIN_1: + goto charok; + case TIS_620: + if (c >= 0xa1) c += 0x0e01-0xa1; + goto charok; + case JIS_0201: + if (c >= 0xa1) { + if (c <= 0xdf) c += 0xff61-0xa1; + else goto ilseq; + } + goto charok; + case UTF_16BE: + case UTF_16LE: + l = 2; + if (*inb < 2) goto starved; + c = get_16(*in, from); + if ((unsigned)(c-0xdc00) < 0x400) goto ilseq; + if ((unsigned)(c-0xd800) < 0x400) { + l = 4; + if (*inb < 4) goto starved; + d = get_16(*in + 2, from); + if ((unsigned)(c-0xdc00) >= 0x400) goto ilseq; + c = ((c-0xd800)<<10) | (d-0xdc00); + } + break; + case UTF_32BE: + case UTF_32LE: + l = 4; + if (*inb < 4) goto starved; + // FIXME + // c = get_32(*in, from); + break; + default: + /* only support ascii supersets */ + if (c < 0x80) break; + switch (map[0]) { + case UCS2_8BIT: + c -= 0x80; + break; + case EUC: + if ((unsigned)c - 0xa1 >= 94) goto ilseq; + if ((unsigned)in[0][1] - 0xa1 >= 94) goto ilseq; + c = (c-0xa1)*94 + (in[0][1]-0xa1); + l = 2; + break; + case SHIFT_JIS: + if ((unsigned)c - 0xa1 <= 0xdf-0xa1) { + c += 0xff61-0xa1; + goto charok; + } + // FIXME... + l = 2; + break; + default: + goto badf; + } + c = get_16(map + 4 + 2*c, 0); + if (c == 0xffff) goto ilseq; + goto charok; + } + + if ((unsigned)c - 0xd800 < 0x800 || (unsigned)c >= 0x110000) + goto ilseq; +charok: + switch (to) { + case WCHAR_T: + if (*outb < sizeof(wchar_t)) goto toobig; + *(wchar_t *)*out = c; + *out += sizeof(wchar_t); + *outb -= sizeof(wchar_t); + break; + case UTF_8: + if (*outb < 4) { + k = utf8enc_wchar(tmp, c); + if (*outb < k) goto toobig; + memcpy(*out, tmp, k); + } else k = utf8enc_wchar(*out, c); + *out += k; + *outb -= k; + break; + case US_ASCII: + if (c > 0x7f) c = 0xfffd; + /* fall thru and count replacement in latin1 case */ + case LATIN_9: + if (c >= 0x100 && c != 0xfffd) + c = latin9_translit(c); + /* fall through */ + case LATIN_1: + if (!*outb) goto toobig; + if (c < 0x100) **out = c; + else x++, **out = '*'; //FIXME: translit? + ++*out; + --*outb; + break; + case UTF_16BE: + case UTF_16LE: + if (c < 0x10000) { + if (*outb < 2) goto toobig; + put_16(*out, c, to); + *out += 2; + *outb -= 2; + break; + } + if (*outb < 4) goto toobig; + put_16(*out, (c>>10)|0xd800, to); + put_16(*out + 2, (c&0x3ff)|0xdc00, to); + *out += 4; + *outb -= 4; + break; + default: + goto badf; + } + } + return x; +ilseq: + err = EILSEQ; + x = -1; + goto end; +badf: + err = EBADF; + x = -1; + goto end; +toobig: + err = E2BIG; + x = -1; + goto end; +starved: + err = EINVAL; +end: + errno = err; + return x; +} |