diff options
author | Eric Andersen <andersen@codepoet.org> | 2000-10-20 03:48:11 +0000 |
---|---|---|
committer | Eric Andersen <andersen@codepoet.org> | 2000-10-20 03:48:11 +0000 |
commit | 82d766043c6a8dcf6283788419f110dd7ab52f80 (patch) | |
tree | 09505131008d1b4d2178065878c3e8e0d54c26a2 /include/regexp.h | |
parent | 5ce562fc21a7fb6385dc054c8df17009f68b05ae (diff) |
A smaller, kinder, gentler regexp implementation.
Diffstat (limited to 'include/regexp.h')
-rw-r--r-- | include/regexp.h | 423 |
1 files changed, 210 insertions, 213 deletions
diff --git a/include/regexp.h b/include/regexp.h index 174e10b75..fc60d3ca5 100644 --- a/include/regexp.h +++ b/include/regexp.h @@ -1,224 +1,221 @@ -/* - * regexp.h -- old-style regexp compile and step (emulated with POSIX regex) - * Copyright (C) 1993 Rick Sladkey <jrs@world.std.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Library Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Library Public License for more details. - */ - -/* - * Think really hard before you intentionally include this file. - * You should really be using the POSIX regex interface instead. - * This emulation file is intended solely for compiling old code. - * - * A program that uses this file must define six macros: INIT, - * GETC, PEEKC, UNGETC, RETURN, and ERROR. This interface is - * so arcane that VMS hackers point at it in ridicule. - */ +/* Copyright (C) 1996, 1997, 1998, 1999 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Ulrich Drepper <drepper@cygnus.com>, 1996. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ #ifndef _REGEXP_H -#define _REGEXP_H - -#include <sys/types.h> /* regex.h needs size_t */ -#include <regex.h> /* POSIX.2 regexp routines */ -#include <stdlib.h> /* for malloc, realloc and free */ - -/* - * These three advertised external variables record state information - * for compile and step. They are so gross, I'm choking as I write this. - */ -char *loc1; /* the beginning of a match */ -char *loc2; /* the end of a match */ -int circf; /* current pattern begins with '^' */ - -/* - * These are the other variables mentioned in the regexp.h manpage. - * Since we don't emulate them (whatever they do), we want errors if - * they are referenced. Therefore they are commented out here. - */ -#if 0 -char *locs; -int sed; -int nbra; -#endif +#define _REGEXP_H 1 + +/* The contents of this header file was first standardized in X/Open + System Interface and Headers Issue 2, originally coming from SysV. + In issue 4, version 2, it is marked as TO BE WITDRAWN. + + This code shouldn't be used in any newly written code. It is + included only for compatibility reasons. Use the POSIX definition + in <regex.h> for portable applications and a reasonable interface. */ + +#include <features.h> +#include <alloca.h> +#include <regex.h> +#include <stdlib.h> +#include <string.h> + +/* The implementation provided here emulates the needed functionality + by mapping to the POSIX regular expression matcher. The interface + for the here included function is weird (this really is a harmless + word). + + The user has to provide six macros before this header file can be + included: + + INIT Declarations vor variables which can be used by the + other macros. + + GETC() Return the value of the next character in the regular + expression pattern. Successive calls should return + successive characters. + + PEEKC() Return the value of the next character in the regular + expression pattern. Immediately successive calls to + PEEKC() should return the same character which should + also be the next character returned by GETC(). + + UNGETC(c) Cause `c' to be returned by the next call to GETC() and + PEEKC(). + + RETURN(ptr) Used for normal exit of the `compile' function. `ptr' + is a pointer to the character after the last character of + the compiled regular expression. + + ERROR(val) Used for abnormal return from `compile'. `val' is the + error number. The error codes are: + 11 Range endpoint too large. + 16 Bad number. + 25 \digit out of range. + 36 Illegal or missing delimiter. + 41 No remembered search string. + 42 \( \) imbalance. + 43 Too many \(. + 44 More tan two numbers given in \{ \}. + 45 } expected after \. + 46 First number exceeds second in \{ \}. + 49 [ ] imbalance. + 50 Regular expression overflow. + + */ + +__BEGIN_DECLS + +/* Interface variables. They contain the results of the successful + calls to `setp' and `advance'. */ +extern char *loc1; +extern char *loc2; + +/* The use of this variable in the `advance' function is not + supported. */ +extern char *locs; + -/* - * We need to stuff a regex_t into an arbitrary buffer so align it. - * GCC make this easy. For the others we have to guess. - */ -#ifdef __GNUC__ -#define __REGEX_T_ALIGN (__alignof__(regex_t)) -#else /* !__GNUC__ */ -#define __REGEX_T_ALIGN 8 -#endif /* !__GNUC__ */ - -#define __regex_t_align(p) \ - ((regex_t *) ((((unsigned long) p) + __REGEX_T_ALIGN - 1) \ - / __REGEX_T_ALIGN * __REGEX_T_ALIGN)) - -/* - * We just slurp the whole pattern into a string and then compile - * it `normally'. With this implementation we never use the PEEKC - * macro. Please feel free to die laughing when we translate - * error symbols into hard-coded numbers. - */ +#ifndef __DO_NOT_DEFINE_COMPILE +/* Get and compile the user supplied pattern up to end of line or + string or until EOF is seen, whatever happens first. The result is + placed in the buffer starting at EXPBUF and delimited by ENDBUF. + + This function cannot be defined in the libc itself since it depends + on the macros. */ char * -compile(char *instring, char *expbuf, char *endbuf, int eof) +compile (char *__restrict instring, char *__restrict expbuf, + __const char *__restrict endbuf, int eof) { - int __c; - int __len; - char *__buf; - int __buflen; - int __error; - regex_t *__preg; - INIT; - - __buflen = 128; - __buf = malloc(__buflen); - if (!__buf) { - ERROR(50); - return 0; - } - __len = 0; - circf = 0; - for (;;) { - __c = GETC(); - if (__c == eof) - break; - if (__c == '\0' || __c == '\n') { - UNGETC(__c); - break; - } - if (__len + 2 > __buflen) { - __buflen *= 2; - __buf = realloc(__buf, __buflen); - if (!__buf) { - ERROR(50); - return 0; - } - } - if (__len == 0 && !circf && __c == '^') - circf = 1; - else - __buf[__len++] = __c; - } - if (__len == 0 && !circf) { - free(__buf); - ERROR(41); - return 0; - } - __buf[__len] = '\0'; - if (endbuf <= expbuf + sizeof(regex_t)) { - free(__buf); - ERROR(50); - return 0; - } - __preg = __regex_t_align(expbuf); - __preg->buffer = (char *) (__preg + 1); - __preg->allocated = endbuf - (char *) __preg->buffer; - __error = regcomp(__preg, __buf, REG_NEWLINE); - free(__buf); - switch (__error) { - case 0: - break; - case REG_BADRPT: - __error = 36; /* poor fit */ - break; - case REG_BADBR: - __error = 16; - break; - case REG_EBRACE: - __error = 44; /* poor fit */ - break; - case REG_EBRACK: - __error = 49; - break; - case REG_ERANGE: - __error = 36; /* poor fit */ - break; - case REG_ECTYPE: - __error = 36; /* poor fit */ - break; - case REG_EPAREN: - __error = 42; - break; - case REG_ESUBREG: - __error = 36; /* poor fit */ - break; - case REG_EEND: - __error = 36; /* poor fit */ - break; - case REG_EESCAPE: - __error = 36; - break; - case REG_BADPAT: - __error = 36; /* poor fit */ - break; - case REG_ESIZE: - __error = 50; - break; - case REG_ESPACE: - __error = 50; - break; - default: - __error = 36; /* as good as any */ - break; - } - if (__error) { - ERROR(__error); - return 0; + char *__input_buffer = NULL; + size_t __input_size = 0; + size_t __current_size = 0; + int __ch; + int __error; + INIT + + /* Align the expression buffer according to the needs for an object + of type `regex_t'. Then check for minimum size of the buffer for + the compiled regular expression. */ + regex_t *__expr_ptr; +# if defined __GNUC__ && __GNUC__ >= 2 + const size_t __req = __alignof__ (regex_t *); +# else + /* How shall we find out? We simply guess it and can change it is + this really proofs to be wrong. */ + const size_t __req = 8; +# endif + expbuf += __req; + expbuf -= (expbuf - ((char *) 0)) % __req; + if (endbuf < expbuf + sizeof (regex_t)) + { + ERROR (50); + } + __expr_ptr = (regex_t *) expbuf; + /* The remaining space in the buffer can be used for the compiled + pattern. */ + __expr_ptr->buffer = expbuf + sizeof (regex_t); + __expr_ptr->allocated = endbuf - (char *) __expr_ptr->buffer; + + while ((__ch = (GETC ())) != eof) + { + if (__ch == '\0' || __ch == '\n') + { + UNGETC (__ch); + break; } -#ifdef _RX_H - RETURN((__preg->buffer + __preg->rx.allocated - __preg->rx.reserved)); -#else - RETURN((__preg->buffer + __preg->used)); -#endif -} -/* - * Note how we carefully emulate the gross `circf' hack. Otherwise, - * this just looks like an ordinary matching call that records the - * starting and ending match positions. - */ -int -step(char *string, char *expbuf) -{ - int __result; - regmatch_t __pmatch[1]; - - __result = regexec(__regex_t_align(expbuf), string, 1, __pmatch, 0); - if (circf && __pmatch[0].rm_so != 0) - __result = REG_NOMATCH; - if (__result == 0) { - loc1 = string + __pmatch[0].rm_so; - loc2 = string + __pmatch[0].rm_eo; + if (__current_size + 1 >= __input_size) + { + size_t __new_size = __input_size ? 2 * __input_size : 128; + char *__new_room = (char *) alloca (__new_size); + /* See whether we can use the old buffer. */ + if (__new_room + __new_size == __input_buffer) + { + __input_size += __new_size; + __input_buffer = (char *) memcpy (__new_room, __input_buffer, + __current_size); + } + else if (__input_buffer + __input_size == __new_room) + __input_size += __new_size; + else + { + __input_size = __new_size; + __input_buffer = (char *) memcpy (__new_room, __input_buffer, + __current_size); + } } - return __result == 0; -} + __input_buffer[__current_size++] = __ch; + } + __input_buffer[__current_size++] = '\0'; -/* - * For advance we are only supposed to match at the beginning of the - * string. You have to read the man page really carefully to find this - * one. We'll match them kludge-for-kludge. - */ -int -advance(char *string, char *expbuf) -{ - int __old_circf; - int __result; - - __old_circf = circf; - circf = 1; - __result = step(string, expbuf); - circf = __old_circf; - return __result; + /* Now compile the pattern. */ + __error = regcomp (__expr_ptr, __input_buffer, REG_NEWLINE); + if (__error != 0) + /* Oh well, we have to translate POSIX error codes. */ + switch (__error) + { + case REG_BADPAT: + case REG_ECOLLATE: + case REG_ECTYPE: + case REG_EESCAPE: + case REG_BADRPT: + case REG_EEND: + case REG_ERPAREN: + default: + /* There is no matching error code. */ + RETURN (36); + case REG_ESUBREG: + RETURN (25); + case REG_EBRACK: + RETURN (49); + case REG_EPAREN: + RETURN (42); + case REG_EBRACE: + RETURN (44); + case REG_BADBR: + RETURN (46); + case REG_ERANGE: + RETURN (11); + case REG_ESPACE: + case REG_ESIZE: + ERROR (50); + } + + /* Everything is ok. */ + RETURN ((char *) (__expr_ptr->buffer + __expr_ptr->used)); } +#endif + + +/* Find the next match in STRING. The compiled regular expression is + found in the buffer starting at EXPBUF. `loc1' will return the + first character matched and `loc2' points to the next unmatched + character. */ +extern int step __P ((__const char *__restrict __string, + __const char *__restrict __expbuf)); + +/* Match the beginning of STRING with the compiled regular expression + in EXPBUF. If the match is successful `loc2' will contain the + position of the first unmatched character. */ +extern int advance __P ((__const char *__restrict __string, + __const char *__restrict __expbuf)); + + +__END_DECLS -#endif /* _REGEXP_H */ +#endif /* regexp.h */ |