From d730fb4d646ac74ee6bbd227411329090fb3206c Mon Sep 17 00:00:00 2001 From: Manuel Novoa III Date: Thu, 23 May 2002 20:33:53 +0000 Subject: Multibyte and wide char conversion functions. Some work still to do, but they're quite solid now and Erik needs them for the gcc port. Comments at the head of wchar.c. --- libc/misc/wchar/Makefile | 47 +++ libc/misc/wchar/wchar.c | 723 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 770 insertions(+) create mode 100644 libc/misc/wchar/Makefile create mode 100644 libc/misc/wchar/wchar.c (limited to 'libc/misc/wchar') diff --git a/libc/misc/wchar/Makefile b/libc/misc/wchar/Makefile new file mode 100644 index 000000000..23a1e9bba --- /dev/null +++ b/libc/misc/wchar/Makefile @@ -0,0 +1,47 @@ +# Makefile for uClibc +# +# Copyright (C) 2000 by Lineo, inc. +# Copyright (C) 2000,2001 Erik Andersen +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU Library General Public License as published by the Free +# Software Foundation; either version 2 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more +# details. +# +# You should have received a copy of the GNU Library General Public License +# along with this program; if not, write to the Free Software Foundation, Inc., +# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Derived in part from the Linux-8086 C library, the GNU C Library, and several +# other sundry sources. Files within this library are copyright by their +# respective copyright holders. + +TOPDIR=../../../ +include $(TOPDIR)Rules.mak + +MSRC= wchar.c +MOBJ= btowc.o wctob.o mbsinit.o mbrlen.o mbrtowc.o wcrtomb.o mbsrtowcs.o \ + wcsrtombs.o _wchar_utf8sntowcs.o _wchar_wcstoutf8s.o \ + __mbsnrtowcs.o __wcsnrtombs.o + +OBJS=$(MOBJ) + +all: $(OBJS) $(LIBC) + +$(LIBC): ar-target + +ar-target: $(OBJS) + $(AR) $(ARFLAGS) $(LIBC) $(OBJS) + +$(MOBJ): $(MSRC) + $(CC) $(CFLAGS) -DL_$* $< -c -o $*.o + $(STRIPTOOL) -x -R .note -R .comment $*.o + +clean: + rm -f *.[oa] *~ core + diff --git a/libc/misc/wchar/wchar.c b/libc/misc/wchar/wchar.c new file mode 100644 index 000000000..f2d9f4a7d --- /dev/null +++ b/libc/misc/wchar/wchar.c @@ -0,0 +1,723 @@ +/* Copyright (C) 2002 Manuel Novoa III + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +/* ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION! + * + * Besides uClibc, I'm using this code in my libc for elks, which is + * a 16-bit environment with a fairly limited compiler. It would make + * things much easier for me if this file isn't modified unnecessarily. + * In particular, please put any new or replacement functions somewhere + * else, and modify the makefile to use your version instead. + * Thanks. Manuel + * + * ATTENTION! ATTENTION! ATTENTION! ATTENTION! ATTENTION! */ + + +/* May 23, 2002 Initial Notes: + * + * I'm still tweaking this stuff, but it passes the tests I've thrown + * at it, and Erik needs it for the gcc port. The glibc extension + * __wcsnrtombs() hasn't been tested, as I didn't find a test for it + * in the glibc source. I also need to fix the behavior of + * _wchar_utf8sntowcs() if the max number of wchars to convert is 0. + * + * UTF-8 -> wchar -> UTF-8 conversion tests on Markus Kuhn's UTF-8-demo.txt + * file on my platform (x86) show about 5-10% faster conversion speed than + * glibc with mbsrtowcs()/wcsrtombs() and almost twice as fast as glibc with + * individual mbrtowc()/wcrtomb() calls. + * + * If 'DECODER' is defined, then _wchar_utf8sntowcs() will be compiled + * as a fail-safe UTF-8 decoder appropriate for a terminal, etc. which + * needs to deal gracefully with whatever is sent to it. In that mode, + * it passes Markus Kuhn's UTF-8-test.txt stress test. I plan to add + * an arg to force that behavior, so the interface will be changing. + * + * I need to fix the error checking for 16-bit wide chars. This isn't + * an issue for uClibc, but may be for ELKS. I'm currently not sure + * if I'll use 16-bit, 32-bit, or configureable wchars in ELKS. + * + * Manuel + */ + +#define _GNU_SOURCE +#define _ISOC99_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ENCODING (__global_locale.encoding) + +#if WCHAR_MAX > 0xffffU +#define UTF_8_MAX_LEN 6 +#else +#define UTF_8_MAX_LEN 3 +#endif + +/* #define KUHN */ + +#warning implement __CTYPE_HAS_UTF_8_LOCALES! +#define __CTYPE_HAS_UTF_8_LOCALES + +/* Implementation-specific work functions. */ + +extern size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn, + const char **__restrict src, size_t n, + mbstate_t *ps, int allow_continuation); + +extern size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n, + const wchar_t **__restrict src, size_t wn); + +/* glibc extensions. */ + +extern size_t __mbsnrtowcs(wchar_t *__restrict dst, + const char **__restrict src, + size_t NMC, size_t len, mbstate_t *__restrict ps); + +extern size_t __wcsnrtombs(char *__restrict dst, + const wchar_t **__restrict src, + size_t NWC, size_t len, mbstate_t *__restrict ps); + +/**********************************************************************/ +#ifdef L_btowc + +wint_t btowc(int c) +{ + wchar_t wc; + unsigned char buf[1]; + mbstate_t mbstate; + + if (c != EOF) { + *buf = (unsigned char) c; + mbstate.mask = 0; /* Initialize the mbstate. */ + if (mbrtowc(&wc, buf, 1, &mbstate) == 1) { + return wc; + } + } + return WEOF; +} + +#endif +/**********************************************************************/ +#ifdef L_wctob + +/* Note: We completely ignore ps in all currently supported conversions. */ + +int wctob(wint_t c) +{ + unsigned char buf[MB_LEN_MAX]; + + return (wcrtomb(buf, c, NULL) == 1) ? *buf : EOF; +} + +#endif +/**********************************************************************/ +#ifdef L_mbsinit + +int mbsinit(const mbstate_t *ps) +{ + return !ps || !ps->mask; +} + +#endif +/**********************************************************************/ +#ifdef L_mbrlen + +size_t mbrlen(const char *__restrict s, size_t n, mbstate_t *__restrict ps) +{ + static mbstate_t mbstate; /* Rely on bss 0-init. */ + + return mbrtowc(NULL, s, n, (ps != NULL) ? ps : &mbstate); +} + +#endif +/**********************************************************************/ +#ifdef L_mbrtowc + +size_t mbrtowc(wchar_t *__restrict pwc, const char *__restrict s, + size_t n, mbstate_t *__restrict ps) +{ + static mbstate_t mbstate; /* Rely on bss 0-init. */ + wchar_t wcbuf[1]; + const char *p; + size_t r; + char empty_string[1]; /* Avoid static to be fPIC friendly. */ + + if (!ps) { + ps = &mbstate; + } + + if (!s) { + pwc = (wchar_t *) s; /* NULL */ + empty_string[0] = 0; /* Init the empty string when necessary. */ + s = empty_string; + n = 1; + } else if (!n) { + return (ps->mask && (ps->wc == 0xffffU)) /* TODO: change error code? */ + ? ((size_t) -1) : ((size_t) -2); + } + + p = s; + +#ifdef __CTYPE_HAS_UTF_8_LOCALES + /* Need to do this here since mbsrtowcs doesn't allow incompletes. */ + if (ENCODING == __ctype_encoding_utf8) { + r = _wchar_utf8sntowcs(pwc, 1, &p, n, ps, 1); + return (r == 1) ? (p-s) : r; + } +#endif + + r = __mbsnrtowcs(wcbuf, &p, SIZE_MAX, 1, ps); + + if (((ssize_t) r) >= 0) { + if (pwc) { + *pwc = *wcbuf; + } + } + return (size_t) r; +} + +#endif +/**********************************************************************/ +#ifdef L_wcrtomb + +/* Note: We completely ignore ps in all currently supported conversions. */ +/* TODO: Check for valid state anyway? */ + +size_t wcrtomb(register char *__restrict s, wchar_t wc, + mbstate_t *__restrict ps) +{ + wchar_t wcbuf[2]; + const wchar_t *pwc; + size_t r; + char buf[MB_LEN_MAX]; + + if (!s) { + s = buf; + wc = 0; + } + + pwc = wcbuf; + wcbuf[0] = wc; + wcbuf[1] = 0; + + r = __wcsnrtombs(s, &pwc, SIZE_MAX, MB_LEN_MAX, ps); + return (r != 0) ? r : 1; +} + +#endif +/**********************************************************************/ +#ifdef L_mbsrtowcs + +size_t mbsrtowcs(wchar_t *__restrict dst, const char **__restrict src, + size_t len, mbstate_t *__restrict ps) +{ + static mbstate_t mbstate; /* Rely on bss 0-init. */ + + return __mbsnrtowcs(dst, src, SIZE_MAX, len, + ((ps != NULL) ? ps : &mbstate)); +} + +#endif +/**********************************************************************/ +#ifdef L_wcsrtombs + +/* Note: We completely ignore ps in all currently supported conversions. + + * TODO: Check for valid state anyway? */ + +size_t wcsrtombs(char *__restrict dst, const wchar_t **__restrict src, + size_t len, mbstate_t *__restrict ps) +{ + return __wcsnrtombs(dst, src, SIZE_MAX, len, ps); +} + +#endif +/**********************************************************************/ +#ifdef L__wchar_utf8sntowcs + +/* Define DECODER to generate a UTF-8 decoder which passes Markus Kuhn's + * UTF-8-test.txt strss test. + */ +/* #define DECODER */ + +#ifdef DECODER +#ifndef KUHN +#define KUHN +#endif +#endif + +size_t _wchar_utf8sntowcs(wchar_t *__restrict pwc, size_t wn, + const char **__restrict src, size_t n, + mbstate_t *ps, int allow_continuation) +{ + register const char *s; + __uwchar_t mask; + __uwchar_t wc; + wchar_t wcbuf[1]; + size_t count; + int incr; + + s = *src; + + assert(s != NULL); + assert(ps != NULL); + + incr = 1; + if (!pwc) { + pwc = wcbuf; + wn = SIZE_MAX; + incr = 0; + } +#warning fix _wchar_utf8sntowcs to allow wn == 0! + assert(wn > 0); /* TODO: fix this!! */ + + count = wn; + + if ((mask = (__uwchar_t) ps->mask) != 0) { /* A continuation... */ +#ifdef DECODER + wc = (__uwchar_t) ps->wc; + if (n) { + goto CONTINUE; + } + goto DONE; +#else + if ((wc = (__uwchar_t) ps->wc) != 0xffffU) { + /* TODO: change error code here and below? */ + if (n) { + goto CONTINUE; + } + goto DONE; + } + return (size_t) -1; /* We're in an error state. */ +#endif + } + + do { + if (!n) { + goto DONE; + } + --n; + if ((wc = ((unsigned char) *s++)) >= 0x80) { /* Not ASCII... */ + mask = 0x40; +#warning fix range for 16 bit wides + if ( ((unsigned char)(s[-1] - 0xc0)) < (0xfe - 0xc0) ) { + goto START; + } + BAD: +#ifdef DECODER + wc = 0xfffd; + goto COMPLETE; +#else + ps->mask = mask; + ps->wc = 0xffffU; + return (size_t) -1; /* Illegal start byte! */ +#endif + + CONTINUE: + while (n) { + --n; + if ((*s & 0xc0) != 0x80) { + goto BAD; + } + mask <<= 5; + wc <<= 6; + wc += (*s & 0x3f); /* keep seperate for bcc (smaller code) */ + ++s; + START: + wc &= ~(mask << 1); + + if ((wc & mask) == 0) { /* Character completed. */ + if ((mask >>= 5) == 0x40) { + mask += mask; + } + /* Check for invalid sequences (longer than necessary) + * and invalid chars. */ + if ( (wc < mask) /* Sequence not minimal length. */ +#ifdef KUHN +#if UTF_8_MAX_LEN == 3 +#error broken since mask can overflow!! + /* For plane 0, these are the only defined values.*/ + || (wc > 0xfffdU) +#else + /* Note that we don't need to worry about exceeding */ + /* 31 bits as that is the most that UTF-8 provides. */ + || ( ((__uwchar_t)(wc - 0xfffeU)) < 2) +#endif + || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) ) +#endif /* KUHN */ + ) { + goto BAD; + } + goto COMPLETE; + } + } + /* Character potentially valid but incomplete. */ + if (!allow_continuation) { + if (count != wn) { + return 0; + } + /* NOTE: The following can fail if you allow and then disallow + * continuation!!! */ +#if UTF_8_MAX_LEN == 3 +#error broken since mask can overflow!! +#endif + /* Need to back up... */ + do { + --s; + } while ((mask >>= 5) >= 0x40); + goto DONE; + } + ps->mask = (wchar_t) mask; + ps->wc = (wchar_t) wc; + *src = s; + return (size_t) -2; + } + COMPLETE: + *pwc = wc; + pwc += incr; + + } +#ifdef DECODER + while (--count); +#else + while (wc && --count); + + if (!wc) { + s = NULL; + } +#endif + + DONE: + /* ps->wc is irrelavent here. */ + ps->mask = 0; + if (pwc != wcbuf) { + *src = s; + } + + return wn - count; +} + +#endif +/**********************************************************************/ +#ifdef L__wchar_wcstoutf8s + +size_t _wchar_wcsntoutf8s(char *__restrict s, size_t n, + const wchar_t **__restrict src, size_t wn) +{ + register char *p; + size_t len, t; + __uwchar_t wc; + const __uwchar_t *swc; + int store; + char buf[MB_LEN_MAX]; + char m; + + store = 1; + if (!s) { + s = buf; + n = SIZE_MAX; + store = 0; + } + + t = n; + swc = (const __uwchar_t *) *src; + + assert(swc != NULL); + + while (wn && t) { + wc = *swc; + + *s = wc; + len = 1; + + if (wc >= 0x80) { +#ifdef KUHN + if ( +#if UTF_8_MAX_LEN == 3 + /* For plane 0, these are the only defined values.*/ + /* Note that we don't need to worry about exceeding */ + /* 31 bits as that is the most that UTF-8 provides. */ + (wc > 0xfffdU) +#else + /* UTF_8_MAX_LEN == 6 */ + (wc > 0x7fffffffUL) + || ( ((__uwchar_t)(wc - 0xfffeU)) < 2) +#endif + || ( ((__uwchar_t)(wc - 0xd800U)) < (0xe000U - 0xd800U) ) + ) { + return (size_t) -1; + } +#else /* KUHN */ +#if UTF_8_MAX_LEN != 3 + if (wc > 0x7fffffffUL) { /* Value too large. */ + return (size_t) -1; + } +#endif +#endif /* KUHN */ + + wc >>= 1; + p = s; + do { + ++p; + } while (wc >>= 5); + wc = *swc; + if ((len = p - s) > t) { /* Not enough space. */ + break; + } + + m = 0x80; + while( p>s ) { + m = (m >> 1) | 0x80; + *--p = (wc & 0x3f) | 0x80; + wc >>= 6; + } + *s |= (m << 1); + } else if (wc == 0) { /* End of string. */ + swc = NULL; + break; + } + + ++swc; + --wn; + t -= len; + if (store) { + s += len; + } + } + + *src = (const wchar_t *) swc; + + return n - t; +} + + +#endif +/**********************************************************************/ +#ifdef L___mbsnrtowcs + +/* WARNING: We treat len as SIZE_MAX when dst is NULL! */ + +size_t mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src, + size_t NMC, size_t len, mbstate_t *__restrict ps) + __attribute__ ((__weak__, __alias__("__mbsnrtowcs"))); + +size_t __mbsnrtowcs(wchar_t *__restrict dst, const char **__restrict src, + size_t NMC, size_t len, mbstate_t *__restrict ps) +{ + static mbstate_t mbstate; /* Rely on bss 0-init. */ + wchar_t wcbuf[1]; + const char *s; + size_t count, r; + int incr; + + if (!ps) { + ps = &mbstate; + } + +#ifdef __CTYPE_HAS_UTF_8_LOCALES + if (ENCODING == __ctype_encoding_utf8) { + return ((r = _wchar_utf8sntowcs(dst, len, src, NMC, ps, 1)) + != (size_t) -2) ? r : 0; + } +#endif + incr = 1; + if (!dst) { + dst = wcbuf; + len = SIZE_MAX; + incr = 0; + } + + /* Since all the following encodings are single-byte encodings... */ + if (len > NMC) { + len = NMC; + } + + count = len; + s = *src; + +#ifdef __CTYPE_HAS_8_BIT_LOCALES + if (ENCODING == __ctype_encoding_8_bit) { + wchar_t wc; + while (count) { + if ((wc = ((unsigned char)(*s))) >= 0x80) { /* Non-ASCII... */ + wc -= 0x80; + wc = __global_locale.tbl8c2wc[ + (__global_locale.idx8c2wc[wc >> Cc2wc_IDX_SHIFT] + << Cc2wc_IDX_SHIFT) + (wc & (Cc2wc_ROW_LEN - 1))]; + if (!wc) { + goto BAD; + } + } else if (!wc) { + s = NULL; + break; + } + ++s; + *dst = wc; + dst += incr; + --count; + } + if (dst != wcbuf) { + *src = s; + } + return len - count; + } +#endif + + assert(ENCODING == __ctype_encoding_7_bit); + + while (count) { + if ((*dst = (unsigned char) *s) == 0) { + s = NULL; + break; + } + if (*dst >= 0x80) { +#ifdef __CTYPE_HAS_8_BIT_LOCALES + BAD: +#endif + __set_errno(EILSEQ); + return (size_t) -1; + } + ++s; + dst += incr; + --count; + } + if (dst != wcbuf) { + *src = s; + } + return len - count; +} + +#endif +/**********************************************************************/ +#ifdef L___wcsnrtombs + +/* WARNING: We treat len as SIZE_MAX when dst is NULL! */ + +/* Note: We completely ignore ps in all currently supported conversions. + * TODO: Check for valid state anyway? */ + +size_t wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src, + size_t NWC, size_t len, mbstate_t *__restrict ps) + __attribute__ ((__weak__, __alias__("__wcsnrtombs"))); + +size_t __wcsnrtombs(char *__restrict dst, const wchar_t **__restrict src, + size_t NWC, size_t len, mbstate_t *__restrict ps) +{ + const __uwchar_t *s; + size_t count; + int incr; + char buf[MB_LEN_MAX]; + +#ifdef __CTYPE_HAS_UTF_8_LOCALES + if (ENCODING == __ctype_encoding_utf8) { + return _wchar_wcsntoutf8s(dst, len, src, NWC); + } +#endif + + incr = 1; + if (!dst) { + dst = buf; + len = SIZE_MAX; + incr = 0; + } + + /* Since all the following encodings are single-byte encodings... */ + if (len > NWC) { + len = NWC; + } + + count = len; + s = (const __uwchar_t *) *src; + +#ifdef __CTYPE_HAS_8_BIT_LOCALES + if (ENCODING == __ctype_encoding_8_bit) { + __uwchar_t wc; + __uwchar_t u; + while (count) { + if ((wc = *s) <= 0x7f) { + if (!(*dst = (unsigned char) wc)) { + s = NULL; + break; + } + } else { + u = 0; + if (wc <= Cwc2c_DOMAIN_MAX) { + u = __global_locale.idx8wc2c[wc >> (Cwc2c_TI_SHIFT + + Cwc2c_TT_SHIFT)]; + u = __global_locale.tbl8wc2c[(u << Cwc2c_TI_SHIFT) + + ((wc >> Cwc2c_TT_SHIFT) + & ((1 << Cwc2c_TI_SHIFT)-1))]; + u = __global_locale.tbl8wc2c[Cwc2c_TI_LEN + + (u << Cwc2c_TT_SHIFT) + + (wc & ((1 << Cwc2c_TT_SHIFT)-1))]; + } + +/* #define __WCHAR_REPLACEMENT_CHAR '?' */ +#ifdef __WCHAR_REPLACEMENT_CHAR + *dst = (unsigned char) ( u ? u : __WCHAR_REPLACEMENT_CHAR ); +#else + if (!u) { + goto BAD; + } + *dst = (unsigned char) u; +#endif + } + ++s; + dst += incr; + --count; + } + if (dst != buf) { + *src = (const wchar_t *) s; + } + return len - count; + } +#endif + + assert(ENCODING == __ctype_encoding_7_bit); + + while (count) { + if (*s >= 0x80) { +#if defined(__CTYPE_HAS_8_BIT_LOCALES) && !defined(__WCHAR_REPLACEMENT_CHAR) + BAD: +#endif + __set_errno(EILSEQ); + return (size_t) -1; + + } + if ((*dst = (unsigned char) *s) == 0) { + s = NULL; + break; + } + ++s; + dst += incr; + --count; + } + if (dst != buf) { + *src = (const wchar_t *) s; + } + return len - count; +} + +#endif +/**********************************************************************/ -- cgit v1.2.3