diff options
Diffstat (limited to 'libc')
-rw-r--r-- | libc/misc/regex/Makefile.in | 4 | ||||
-rw-r--r-- | libc/misc/regex/_regex.h | 45 | ||||
-rw-r--r-- | libc/misc/regex/regcomp.c | 3802 | ||||
-rw-r--r-- | libc/misc/regex/regex.c | 105 | ||||
-rw-r--r-- | libc/misc/regex/regex_internal.c | 1645 | ||||
-rw-r--r-- | libc/misc/regex/regex_internal.h | 755 | ||||
-rw-r--r-- | libc/misc/regex/regexec.c | 4335 |
7 files changed, 10691 insertions, 0 deletions
diff --git a/libc/misc/regex/Makefile.in b/libc/misc/regex/Makefile.in index 5c86e874d..9cd403add 100644 --- a/libc/misc/regex/Makefile.in +++ b/libc/misc/regex/Makefile.in @@ -6,7 +6,11 @@ # Licensed under the LGPL v2.1, see the file COPYING.LIB in this tarball. # +ifeq ($(UCLIBC_HAS_REGEX_OLD),y) CSRC:=regex_old.c +else +CSRC:=regex.c +endif MISC_REGEX_DIR:=$(top_srcdir)libc/misc/regex MISC_REGEX_OUT:=$(top_builddir)libc/misc/regex diff --git a/libc/misc/regex/_regex.h b/libc/misc/regex/_regex.h new file mode 100644 index 000000000..cf3a5a2c1 --- /dev/null +++ b/libc/misc/regex/_regex.h @@ -0,0 +1,45 @@ +/* this file is copied from libc/include/regex.h */ + +#ifndef _REGEX_H +#include <regex.h> + +/* Document internal interfaces. */ +extern reg_syntax_t __re_set_syntax (reg_syntax_t syntax); + +extern const char *__re_compile_pattern (const char *pattern, size_t length, + struct re_pattern_buffer *buffer); + +extern int __re_compile_fastmap (struct re_pattern_buffer *buffer); + +extern int __re_search (struct re_pattern_buffer *buffer, const char *string, + int length, int start, int range, + struct re_registers *regs); + +extern int __re_search_2 + (struct re_pattern_buffer *buffer, const char *string1, + int length1, const char *string2, int length2, + int start, int range, struct re_registers *regs, int stop); + +extern int __re_match + (struct re_pattern_buffer *buffer, const char *string, + int length, int start, struct re_registers *regs); + +extern int __re_match_2 + (struct re_pattern_buffer *buffer, const char *string1, + int length1, const char *string2, int length2, + int start, struct re_registers *regs, int stop); + +extern void __re_set_registers + (struct re_pattern_buffer *buffer, struct re_registers *regs, + unsigned num_regs, regoff_t *starts, regoff_t *ends); + +extern int __regcomp (regex_t *__preg, const char *__pattern, int __cflags); + +extern int __regexec (const regex_t *__preg, const char *__string, + size_t __nmatch, regmatch_t __pmatch[], int __eflags); + +extern size_t __regerror (int __errcode, const regex_t *__preg, + char *__errbuf, size_t __errbuf_size); + +extern void __regfree (regex_t *__preg); +#endif diff --git a/libc/misc/regex/regcomp.c b/libc/misc/regex/regcomp.c new file mode 100644 index 000000000..83df61108 --- /dev/null +++ b/libc/misc/regex/regcomp.c @@ -0,0 +1,3802 @@ +/* Extended regular expression matching and search library. + Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern, + size_t length, reg_syntax_t syntax); +static void re_compile_fastmap_iter (regex_t *bufp, + const re_dfastate_t *init_state, + char *fastmap); +static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len); +#ifdef RE_ENABLE_I18N +static void free_charset (re_charset_t *cset); +#endif /* RE_ENABLE_I18N */ +static void free_workarea_compile (regex_t *preg); +static reg_errcode_t create_initial_state (re_dfa_t *dfa); +#ifdef RE_ENABLE_I18N +static void optimize_utf8 (re_dfa_t *dfa); +#endif +static reg_errcode_t analyze (regex_t *preg); +static reg_errcode_t preorder (bin_tree_t *root, + reg_errcode_t (fn (void *, bin_tree_t *)), + void *extra); +static reg_errcode_t postorder (bin_tree_t *root, + reg_errcode_t (fn (void *, bin_tree_t *)), + void *extra); +static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node); +static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node); +static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg, + bin_tree_t *node); +static reg_errcode_t calc_first (void *extra, bin_tree_t *node); +static reg_errcode_t calc_next (void *extra, bin_tree_t *node); +static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node); +static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint); +static int search_duplicated_node (const re_dfa_t *dfa, int org_node, + unsigned int constraint); +static reg_errcode_t calc_eclosure (re_dfa_t *dfa); +static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, + int node, int root); +static reg_errcode_t calc_inveclosure (re_dfa_t *dfa); +static int fetch_number (re_string_t *input, re_token_t *token, + reg_syntax_t syntax); +static int peek_token (re_token_t *token, re_string_t *input, + reg_syntax_t syntax) internal_function; +static bin_tree_t *parse (re_string_t *regexp, regex_t *preg, + reg_syntax_t syntax, reg_errcode_t *err); +static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg, + re_token_t *token, reg_syntax_t syntax, + int nest, reg_errcode_t *err); +static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg, + re_token_t *token, reg_syntax_t syntax, + int nest, reg_errcode_t *err); +static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg, + re_token_t *token, reg_syntax_t syntax, + int nest, reg_errcode_t *err); +static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg, + re_token_t *token, reg_syntax_t syntax, + int nest, reg_errcode_t *err); +static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp, + re_dfa_t *dfa, re_token_t *token, + reg_syntax_t syntax, reg_errcode_t *err); +static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, + re_token_t *token, reg_syntax_t syntax, + reg_errcode_t *err); +static reg_errcode_t parse_bracket_element (bracket_elem_t *elem, + re_string_t *regexp, + re_token_t *token, int token_len, + re_dfa_t *dfa, + reg_syntax_t syntax, + int accept_hyphen); +static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem, + re_string_t *regexp, + re_token_t *token); +#ifdef RE_ENABLE_I18N +static reg_errcode_t build_equiv_class (bitset_t sbcset, + re_charset_t *mbcset, + int *equiv_class_alloc, + const unsigned char *name); +static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, + bitset_t sbcset, + re_charset_t *mbcset, + int *char_class_alloc, + const unsigned char *class_name, + reg_syntax_t syntax); +#else /* not RE_ENABLE_I18N */ +static reg_errcode_t build_equiv_class (bitset_t sbcset, + const unsigned char *name); +static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans, + bitset_t sbcset, + const unsigned char *class_name, + reg_syntax_t syntax); +#endif /* not RE_ENABLE_I18N */ +static bin_tree_t *build_charclass_op (re_dfa_t *dfa, + RE_TRANSLATE_TYPE trans, + const unsigned char *class_name, + const unsigned char *extra, + int non_match, reg_errcode_t *err); +static bin_tree_t *create_tree (re_dfa_t *dfa, + bin_tree_t *left, bin_tree_t *right, + re_token_type_t type); +static bin_tree_t *create_token_tree (re_dfa_t *dfa, + bin_tree_t *left, bin_tree_t *right, + const re_token_t *token); +static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa); +static void free_token (re_token_t *node); +static reg_errcode_t free_tree (void *extra, bin_tree_t *node); +static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node); + +/* This table gives an error message for each of the error codes listed + in regex.h. Obviously the order here has to be same as there. + POSIX doesn't require that we do anything for REG_NOERROR, + but why not be nice? */ + +const char __re_error_msgid[] attribute_hidden = + { +#define REG_NOERROR_IDX 0 + gettext_noop ("Success") /* REG_NOERROR */ + "\0" +#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success") + gettext_noop ("No match") /* REG_NOMATCH */ + "\0" +#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match") + gettext_noop ("Invalid regular expression") /* REG_BADPAT */ + "\0" +#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression") + gettext_noop ("Invalid collation character") /* REG_ECOLLATE */ + "\0" +#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character") + gettext_noop ("Invalid character class name") /* REG_ECTYPE */ + "\0" +#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name") + gettext_noop ("Trailing backslash") /* REG_EESCAPE */ + "\0" +#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash") + gettext_noop ("Invalid back reference") /* REG_ESUBREG */ + "\0" +#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference") + gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */ + "\0" +#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^") + gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */ + "\0" +#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(") + gettext_noop ("Unmatched \\{") /* REG_EBRACE */ + "\0" +#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{") + gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */ + "\0" +#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}") + gettext_noop ("Invalid range end") /* REG_ERANGE */ + "\0" +#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end") + gettext_noop ("Memory exhausted") /* REG_ESPACE */ + "\0" +#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted") + gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */ + "\0" +#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression") + gettext_noop ("Premature end of regular expression") /* REG_EEND */ + "\0" +#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression") + gettext_noop ("Regular expression too big") /* REG_ESIZE */ + "\0" +#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big") + gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */ + }; + +const size_t __re_error_msgid_idx[] attribute_hidden = + { + REG_NOERROR_IDX, + REG_NOMATCH_IDX, + REG_BADPAT_IDX, + REG_ECOLLATE_IDX, + REG_ECTYPE_IDX, + REG_EESCAPE_IDX, + REG_ESUBREG_IDX, + REG_EBRACK_IDX, + REG_EPAREN_IDX, + REG_EBRACE_IDX, + REG_BADBR_IDX, + REG_ERANGE_IDX, + REG_ESPACE_IDX, + REG_BADRPT_IDX, + REG_EEND_IDX, + REG_ESIZE_IDX, + REG_ERPAREN_IDX + }; + +/* Entry points for GNU code. */ + +/* re_compile_pattern is the GNU regular expression compiler: it + compiles PATTERN (of length LENGTH) and puts the result in BUFP. + Returns 0 if the pattern was valid, otherwise an error string. + + Assumes the `allocated' (and perhaps `buffer') and `translate' fields + are set in BUFP on entry. */ + +const char * +re_compile_pattern (pattern, length, bufp) + const char *pattern; + size_t length; + struct re_pattern_buffer *bufp; +{ + reg_errcode_t ret; + + /* And GNU code determines whether or not to get register information + by passing null for the REGS argument to re_match, etc., not by + setting no_sub, unless RE_NO_SUB is set. */ + bufp->no_sub = !!(re_syntax_options & RE_NO_SUB); + + /* Match anchors at newline. */ + bufp->newline_anchor = 1; + + ret = re_compile_internal (bufp, pattern, length, re_syntax_options); + + if (!ret) + return NULL; + return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); +} +#if defined _LIBC || defined __UCLIBC__ +weak_alias (__re_compile_pattern, re_compile_pattern) +#endif + +/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can + also be assigned to arbitrarily: each pattern buffer stores its own + syntax, so it can be changed between regex compilations. */ +/* This has no initializer because initialized variables in Emacs + become read-only after dumping. */ +reg_syntax_t re_syntax_options; + + +/* Specify the precise syntax of regexps for compilation. This provides + for compatibility for various utilities which historically have + different, incompatible syntaxes. + + The argument SYNTAX is a bit mask comprised of the various bits + defined in regex.h. We return the old syntax. */ + +reg_syntax_t +re_set_syntax (syntax) + reg_syntax_t syntax; +{ + reg_syntax_t ret = re_syntax_options; + + re_syntax_options = syntax; + return ret; +} +#if defined _LIBC || defined __UCLIBC__ +weak_alias (__re_set_syntax, re_set_syntax) +#endif + +int +re_compile_fastmap (bufp) + struct re_pattern_buffer *bufp; +{ + re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; + char *fastmap = bufp->fastmap; + + memset (fastmap, '\0', sizeof (char) * SBC_MAX); + re_compile_fastmap_iter (bufp, dfa->init_state, fastmap); + if (dfa->init_state != dfa->init_state_word) + re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap); + if (dfa->init_state != dfa->init_state_nl) + re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap); + if (dfa->init_state != dfa->init_state_begbuf) + re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap); + bufp->fastmap_accurate = 1; + return 0; +} +#if defined _LIBC || defined __UCLIBC__ +weak_alias (__re_compile_fastmap, re_compile_fastmap) +#endif + +static inline void +__attribute ((always_inline)) +re_set_fastmap (char *fastmap, int icase, int ch) +{ + fastmap[ch] = 1; + if (icase) + fastmap[tolower (ch)] = 1; +} + +/* Helper function for re_compile_fastmap. + Compile fastmap for the initial_state INIT_STATE. */ + +static void +re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state, + char *fastmap) +{ + re_dfa_t *dfa = (re_dfa_t *) bufp->buffer; + int node_cnt; + int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE)); + for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt) + { + int node = init_state->nodes.elems[node_cnt]; + re_token_type_t type = dfa->nodes[node].type; + + if (type == CHARACTER) + { + re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c); +#ifdef RE_ENABLE_I18N + if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) + { + unsigned char *buf = alloca (dfa->mb_cur_max), *p; + wchar_t wc; + mbstate_t state; + + p = buf; + *p++ = dfa->nodes[node].opr.c; + while (++node < dfa->nodes_len + && dfa->nodes[node].type == CHARACTER + && dfa->nodes[node].mb_partial) + *p++ = dfa->nodes[node].opr.c; + memset (&state, '\0', sizeof (state)); + if (mbrtowc (&wc, (const char *) buf, p - buf, + &state) == p - buf + && (__wcrtomb ((char *) buf, towlower (wc), &state) + != (size_t) -1)) + re_set_fastmap (fastmap, 0, buf[0]); + } +#endif + } + else if (type == SIMPLE_BRACKET) + { + int i, ch; + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + { + int j; + bitset_word_t w = dfa->nodes[node].opr.sbcset[i]; + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) + if (w & ((bitset_word_t) 1 << j)) + re_set_fastmap (fastmap, icase, ch); + } + } +#ifdef RE_ENABLE_I18N + else if (type == COMPLEX_BRACKET) + { + int i; + re_charset_t *cset = dfa->nodes[node].opr.mbcset; + if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes + || cset->nranges || cset->nchar_classes) + { +# ifdef _LIBC + if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0) + { + /* In this case we want to catch the bytes which are + the first byte of any collation elements. + e.g. In da_DK, we want to catch 'a' since "aa" + is a valid collation element, and don't catch + 'b' since 'b' is the only collation element + which starts from 'b'. */ + const int32_t *table = (const int32_t *) + _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB); + for (i = 0; i < SBC_MAX; ++i) + if (table[i] < 0) + re_set_fastmap (fastmap, icase, i); + } +# else + if (dfa->mb_cur_max > 1) + for (i = 0; i < SBC_MAX; ++i) + if (__btowc (i) == WEOF) + re_set_fastmap (fastmap, icase, i); +# endif /* not _LIBC */ + } + for (i = 0; i < cset->nmbchars; ++i) + { + char buf[256]; + mbstate_t state; + memset (&state, '\0', sizeof (state)); + if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1) + re_set_fastmap (fastmap, icase, *(unsigned char *) buf); + if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1) + { + if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state) + != (size_t) -1) + re_set_fastmap (fastmap, 0, *(unsigned char *) buf); + } + } + } +#endif /* RE_ENABLE_I18N */ + else if (type == OP_PERIOD +#ifdef RE_ENABLE_I18N + || type == OP_UTF8_PERIOD +#endif /* RE_ENABLE_I18N */ + || type == END_OF_RE) + { + memset (fastmap, '\1', sizeof (char) * SBC_MAX); + if (type == END_OF_RE) + bufp->can_be_null = 1; + return; + } + } +} + +/* Entry point for POSIX code. */ +/* regcomp takes a regular expression as a string and compiles it. + + PREG is a regex_t *. We do not expect any fields to be initialized, + since POSIX says we shouldn't. Thus, we set + + `buffer' to the compiled pattern; + `used' to the length of the compiled pattern; + `syntax' to RE_SYNTAX_POSIX_EXTENDED if the + REG_EXTENDED bit in CFLAGS is set; otherwise, to + RE_SYNTAX_POSIX_BASIC; + `newline_anchor' to REG_NEWLINE being set in CFLAGS; + `fastmap' to an allocated space for the fastmap; + `fastmap_accurate' to zero; + `re_nsub' to the number of subexpressions in PATTERN. + + PATTERN is the address of the pattern string. + + CFLAGS is a series of bits which affect compilation. + + If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we + use POSIX basic syntax. + + If REG_NEWLINE is set, then . and [^...] don't match newline. + Also, regexec will try a match beginning after every newline. + + If REG_ICASE is set, then we considers upper- and lowercase + versions of letters to be equivalent when matching. + + If REG_NOSUB is set, then when PREG is passed to regexec, that + routine will report only success or failure, and nothing about the + registers. + + It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for + the return codes and their meanings.) */ + +int +regcomp (preg, pattern, cflags) + regex_t *__restrict preg; + const char *__restrict pattern; + int cflags; +{ + reg_errcode_t ret; + reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED + : RE_SYNTAX_POSIX_BASIC); + + preg->buffer = NULL; + preg->allocated = 0; + preg->used = 0; + + /* Try to allocate space for the fastmap. */ + preg->fastmap = re_malloc (char, SBC_MAX); + if (BE (preg->fastmap == NULL, 0)) + return REG_ESPACE; + + syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0; + + /* If REG_NEWLINE is set, newlines are treated differently. */ + if (cflags & REG_NEWLINE) + { /* REG_NEWLINE implies neither . nor [^...] match newline. */ + syntax &= ~RE_DOT_NEWLINE; + syntax |= RE_HAT_LISTS_NOT_NEWLINE; + /* It also changes the matching behavior. */ + preg->newline_anchor = 1; + } + else + preg->newline_anchor = 0; + preg->no_sub = !!(cflags & REG_NOSUB); + preg->translate = NULL; + + ret = re_compile_internal (preg, pattern, strlen (pattern), syntax); + + /* POSIX doesn't distinguish between an unmatched open-group and an + unmatched close-group: both are REG_EPAREN. */ + if (ret == REG_ERPAREN) + ret = REG_EPAREN; + + /* We have already checked preg->fastmap != NULL. */ + if (BE (ret == REG_NOERROR, 1)) + /* Compute the fastmap now, since regexec cannot modify the pattern + buffer. This function never fails in this implementation. */ + (void) re_compile_fastmap (preg); + else + { + /* Some error occurred while compiling the expression. */ + re_free (preg->fastmap); + preg->fastmap = NULL; + } + + return (int) ret; +} +#if defined _LIBC || defined __UCLIBC__ +weak_alias (__regcomp, regcomp) +#endif + +/* Returns a message corresponding to an error code, ERRCODE, returned + from either regcomp or regexec. We don't use PREG here. */ + +size_t +regerror (errcode, preg, errbuf, errbuf_size) + int errcode; + const regex_t *__restrict preg; + char *__restrict errbuf; + size_t errbuf_size; +{ + const char *msg; + size_t msg_size; + + if (BE (errcode < 0 + || errcode >= (int) (sizeof (__re_error_msgid_idx) + / sizeof (__re_error_msgid_idx[0])), 0)) + /* Only error codes returned by the rest of the code should be passed + to this routine. If we are given anything else, or if other regex + code generates an invalid error code, then the program has a bug. + Dump core so we can fix it. */ + abort (); + + msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]); + + msg_size = strlen (msg) + 1; /* Includes the null. */ + + if (BE (errbuf_size != 0, 1)) + { + if (BE (msg_size > errbuf_size, 0)) + { +#if defined HAVE_MEMPCPY || defined _LIBC + *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0'; +#else + memcpy (errbuf, msg, errbuf_size - 1); + errbuf[errbuf_size - 1] = 0; +#endif + } + else + memcpy (errbuf, msg, msg_size); + } + + return msg_size; +} +#if defined _LIBC || defined __UCLIBC__ +weak_alias (__regerror, regerror) +#endif + + +#ifdef RE_ENABLE_I18N +/* This static array is used for the map to single-byte characters when + UTF-8 is used. Otherwise we would allocate memory just to initialize + it the same all the time. UTF-8 is the preferred encoding so this is + a worthwhile optimization. */ +static const bitset_t utf8_sb_map = +{ + /* Set the first 128 bits. */ + [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX +}; +#endif + + +static void +free_dfa_content (re_dfa_t *dfa) +{ + int i, j; + + if (dfa->nodes) + for (i = 0; i < dfa->nodes_len; ++i) + free_token (dfa->nodes + i); + re_free (dfa->nexts); + for (i = 0; i < dfa->nodes_len; ++i) + { + if (dfa->eclosures != NULL) + re_node_set_free (dfa->eclosures + i); + if (dfa->inveclosures != NULL) + re_node_set_free (dfa->inveclosures + i); + if (dfa->edests != NULL) + re_node_set_free (dfa->edests + i); + } + re_free (dfa->edests); + re_free (dfa->eclosures); + re_free (dfa->inveclosures); + re_free (dfa->nodes); + + if (dfa->state_table) + for (i = 0; i <= dfa->state_hash_mask; ++i) + { + struct re_state_table_entry *entry = dfa->state_table + i; + for (j = 0; j < entry->num; ++j) + { + re_dfastate_t *state = entry->array[j]; + free_state (state); + } + re_free (entry->array); + } + re_free (dfa->state_table); +#ifdef RE_ENABLE_I18N + if (dfa->sb_char != utf8_sb_map) + re_free (dfa->sb_char); +#endif + re_free (dfa->subexp_map); +#ifdef DEBUG + re_free (dfa->re_str); +#endif + + re_free (dfa); +} + + +/* Free dynamically allocated space used by PREG. */ + +void +regfree (preg) + regex_t *preg; +{ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + if (BE (dfa != NULL, 1)) + free_dfa_content (dfa); + preg->buffer = NULL; + preg->allocated = 0; + + re_free (preg->fastmap); + preg->fastmap = NULL; + + re_free (preg->translate); + preg->translate = NULL; +} +#if defined _LIBC || defined __UCLIBC__ +weak_alias (__regfree, regfree) +#endif + +/* Entry points compatible with 4.2 BSD regex library. We don't define + them unless specifically requested. */ + +#if defined _REGEX_RE_COMP || defined _LIBC || defined __UCLIBC__ + +/* BSD has one and only one pattern buffer. */ +static struct re_pattern_buffer re_comp_buf; + +char * +# if defined _LIBC || defined __UCLIBC__ +/* Make these definitions weak in libc, so POSIX programs can redefine + these names if they don't use our functions, and still use + regcomp/regexec above without link errors. */ +weak_function +# endif +re_comp (s) + const char *s; +{ + reg_errcode_t ret; + char *fastmap; + + if (!s) + { + if (!re_comp_buf.buffer) + return gettext ("No previous regular expression"); + return 0; + } + + if (re_comp_buf.buffer) + { + fastmap = re_comp_buf.fastmap; + re_comp_buf.fastmap = NULL; + __regfree (&re_comp_buf); + memset (&re_comp_buf, '\0', sizeof (re_comp_buf)); + re_comp_buf.fastmap = fastmap; + } + + if (re_comp_buf.fastmap == NULL) + { + re_comp_buf.fastmap = (char *) malloc (SBC_MAX); + if (re_comp_buf.fastmap == NULL) + return (char *) gettext (__re_error_msgid + + __re_error_msgid_idx[(int) REG_ESPACE]); + } + + /* Since `re_exec' always passes NULL for the `regs' argument, we + don't need to initialize the pattern buffer fields which affect it. */ + + /* Match anchors at newlines. */ + re_comp_buf.newline_anchor = 1; + + ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options); + + if (!ret) + return NULL; + + /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */ + return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]); +} + +#ifdef _LIBC +libc_freeres_fn (free_mem) +{ + __regfree (&re_comp_buf); +} +#endif + +#endif /* _REGEX_RE_COMP */ + +/* Internal entry point. + Compile the regular expression PATTERN, whose length is LENGTH. + SYNTAX indicate regular expression's syntax. */ + +static reg_errcode_t +re_compile_internal (regex_t *preg, const char * pattern, size_t length, + reg_syntax_t syntax) +{ + reg_errcode_t err = REG_NOERROR; + re_dfa_t *dfa; + re_string_t regexp; + + /* Initialize the pattern buffer. */ + preg->fastmap_accurate = 0; + preg->syntax = syntax; + preg->not_bol = preg->not_eol = 0; + preg->used = 0; + preg->re_nsub = 0; + preg->can_be_null = 0; + preg->regs_allocated = REGS_UNALLOCATED; + + /* Initialize the dfa. */ + dfa = (re_dfa_t *) preg->buffer; + if (BE (preg->allocated < sizeof (re_dfa_t), 0)) + { + /* If zero allocated, but buffer is non-null, try to realloc + enough space. This loses if buffer's address is bogus, but + that is the user's responsibility. If ->buffer is NULL this + is a simple allocation. */ + dfa = re_realloc (preg->buffer, re_dfa_t, 1); + if (dfa == NULL) + return REG_ESPACE; + preg->allocated = sizeof (re_dfa_t); + preg->buffer = (unsigned char *) dfa; + } + preg->used = sizeof (re_dfa_t); + + err = init_dfa (dfa, length); + if (BE (err != REG_NOERROR, 0)) + { + free_dfa_content (dfa); + preg->buffer = NULL; + preg->allocated = 0; + return err; + } +#ifdef DEBUG + /* Note: length+1 will not overflow since it is checked in init_dfa. */ + dfa->re_str = re_malloc (char, length + 1); + strncpy (dfa->re_str, pattern, length + 1); +#endif + + __libc_lock_init (dfa->lock); + + err = re_string_construct (®exp, pattern, length, preg->translate, + syntax & RE_ICASE, dfa); + if (BE (err != REG_NOERROR, 0)) + { + re_compile_internal_free_return: + free_workarea_compile (preg); + re_string_destruct (®exp); + free_dfa_content (dfa); + preg->buffer = NULL; + preg->allocated = 0; + return err; + } + + /* Parse the regular expression, and build a structure tree. */ + preg->re_nsub = 0; + dfa->str_tree = parse (®exp, preg, syntax, &err); + if (BE (dfa->str_tree == NULL, 0)) + goto re_compile_internal_free_return; + + /* Analyze the tree and create the nfa. */ + err = analyze (preg); + if (BE (err != REG_NOERROR, 0)) + goto re_compile_internal_free_return; + +#ifdef RE_ENABLE_I18N + /* If possible, do searching in single byte encoding to speed things up. */ + if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL) + optimize_utf8 (dfa); +#endif + + /* Then create the initial state of the dfa. */ + err = create_initial_state (dfa); + + /* Release work areas. */ + free_workarea_compile (preg); + re_string_destruct (®exp); + + if (BE (err != REG_NOERROR, 0)) + { + free_dfa_content (dfa); + preg->buffer = NULL; + preg->allocated = 0; + } + + return err; +} + +/* Initialize DFA. We use the length of the regular expression PAT_LEN + as the initial length of some arrays. */ + +static reg_errcode_t +init_dfa (re_dfa_t *dfa, size_t pat_len) +{ + unsigned int table_size; +#ifndef _LIBC + char *codeset_name; +#endif + + memset (dfa, '\0', sizeof (re_dfa_t)); + + /* Force allocation of str_tree_storage the first time. */ + dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; + + /* Avoid overflows. */ + if (pat_len == SIZE_MAX) + return REG_ESPACE; + + dfa->nodes_alloc = pat_len + 1; + dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc); + + /* table_size = 2 ^ ceil(log pat_len) */ + for (table_size = 1; ; table_size <<= 1) + if (table_size > pat_len) + break; + + dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size); + dfa->state_hash_mask = table_size - 1; + + dfa->mb_cur_max = MB_CUR_MAX; +#ifdef _LIBC + if (dfa->mb_cur_max == 6 + && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0) + dfa->is_utf8 = 1; + dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII) + != 0); +#else +# ifdef HAVE_LANGINFO_CODESET + codeset_name = nl_langinfo (CODESET); +# else + codeset_name = getenv ("LC_ALL"); + if (codeset_name == NULL || codeset_name[0] == '\0') + codeset_name = getenv ("LC_CTYPE"); + if (codeset_name == NULL || codeset_name[0] == '\0') + codeset_name = getenv ("LANG"); + if (codeset_name == NULL) + codeset_name = ""; + else if (strchr (codeset_name, '.') != NULL) + codeset_name = strchr (codeset_name, '.') + 1; +# endif + + if (strcasecmp (codeset_name, "UTF-8") == 0 + || strcasecmp (codeset_name, "UTF8") == 0) + dfa->is_utf8 = 1; + + /* We check exhaustively in the loop below if this charset is a + superset of ASCII. */ + dfa->map_notascii = 0; +#endif + +#ifdef RE_ENABLE_I18N + if (dfa->mb_cur_max > 1) + { + if (dfa->is_utf8) + dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map; + else + { + int i, j, ch; + + dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1); + if (BE (dfa->sb_char == NULL, 0)) + return REG_ESPACE; + + /* Set the bits corresponding to single byte chars. */ + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) + { + wint_t wch = __btowc (ch); + if (wch != WEOF) + dfa->sb_char[i] |= (bitset_word_t) 1 << j; +# ifndef _LIBC + if (isascii (ch) && wch != ch) + dfa->map_notascii = 1; +# endif + } + } + } +#endif + + if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0)) + return REG_ESPACE; + return REG_NOERROR; +} + +/* Initialize WORD_CHAR table, which indicate which character is + "word". In this case "word" means that it is the word construction + character used by some operators like "\<", "\>", etc. */ + +static void +internal_function +init_word_char (re_dfa_t *dfa) +{ + int i, j, ch; + dfa->word_ops_used = 1; + for (i = 0, ch = 0; i < BITSET_WORDS; ++i) + for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch) + if (isalnum (ch) || ch == '_') + dfa->word_char[i] |= (bitset_word_t) 1 << j; +} + +/* Free the work area which are only used while compiling. */ + +static void +free_workarea_compile (regex_t *preg) +{ + re_dfa_t *dfa = (re_dfa_t *) preg->buffer; + bin_tree_storage_t *storage, *next; + for (storage = dfa->str_tree_storage; storage; storage = next) + { + next = storage->next; + re_free (storage); + } + dfa->str_tree_storage = NULL; + dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE; + dfa->str_tree = NULL; + re_free (dfa->org_indices); + dfa->org_indices = NULL; +} + +/* Create initial states for all contexts. */ + +static reg_errcode_t +create_initial_state (re_dfa_t *dfa) +{ + int first, i; + reg_errcode_t err; + re_node_set init_nodes; + + /* Initial states have the epsilon closure of the node which is + the first node of the regular expression. */ + first = dfa->str_tree->first->node_idx; + dfa->init_node = first; + err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first); + if (BE (err != REG_NOERROR, 0)) + return err; + + /* The back-references which are in initial states can epsilon transit, + since in this case all of the subexpressions can be null. + Then we add epsilon closures of the nodes which are the next nodes of + the back-references. */ + if (dfa->nbackref > 0) + for (i = 0; i < init_nodes.nelem; ++i) + { + int node_idx = init_nodes.elems[i]; + re_token_type_t type = dfa->nodes[node_idx].type; + + int clexp_idx; + if (type != OP_BACK_REF) + continue; + for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx) + { + re_token_t *clexp_node; + clexp_node = dfa->nodes + init_nodes.elems[clexp_idx]; + if (clexp_node->type == OP_CLOSE_SUBEXP + && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx) + break; |