mirror of
https://github.com/fluencelabs/musl
synced 2025-06-26 21:22:11 +00:00
fix ugly bugs in TRE regex parser
1. * in BRE is not special at the beginning of the regex or a subexpression. this broke ncurses' build scripts. 2. \\( in BRE is a literal \ followed by a literal (, not a literal \ followed by a subexpression opener. 3. the ^ in \\(^ in BRE is a literal ^ only at the beginning of the entire BRE. POSIX allows treating it as an anchor at the beginning of a subexpression, but TRE's code for checking if it was at the beginning of a subexpression was wrong, and fixing it for the sake of supporting a non-portable usage was too much trouble when just removing this non-portable behavior was much easier. this patch also moved lots of the ugly logic for empty atom checking out of the default/literal case and into new cases for the relevant characters. this should make parsing faster and make the code smaller. if nothing else it's a lot more readable/logical. at some point i'd like to revisit and overhaul lots of this code...
This commit is contained in:
@ -961,6 +961,8 @@ tre_parse(tre_parse_ctx_t *ctx)
|
|||||||
tre_stack_t *stack = ctx->stack;
|
tre_stack_t *stack = ctx->stack;
|
||||||
int bottom = tre_stack_num_objects(stack);
|
int bottom = tre_stack_num_objects(stack);
|
||||||
int depth = 0;
|
int depth = 0;
|
||||||
|
wchar_t wc;
|
||||||
|
int clen;
|
||||||
|
|
||||||
if (!ctx->nofirstsub)
|
if (!ctx->nofirstsub)
|
||||||
{
|
{
|
||||||
@ -1155,10 +1157,9 @@ tre_parse(tre_parse_ctx_t *ctx)
|
|||||||
{
|
{
|
||||||
case CHAR_LPAREN: /* parenthesized subexpression */
|
case CHAR_LPAREN: /* parenthesized subexpression */
|
||||||
|
|
||||||
if (ctx->cflags & REG_EXTENDED
|
if (ctx->cflags & REG_EXTENDED)
|
||||||
|| (ctx->re > ctx->re_start
|
|
||||||
&& *(ctx->re - 1) == CHAR_BACKSLASH))
|
|
||||||
{
|
{
|
||||||
|
lparen:
|
||||||
depth++;
|
depth++;
|
||||||
{
|
{
|
||||||
ctx->re++;
|
ctx->re++;
|
||||||
@ -1174,25 +1175,6 @@ tre_parse(tre_parse_ctx_t *ctx)
|
|||||||
goto parse_literal;
|
goto parse_literal;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case CHAR_RPAREN: /* end of current subexpression */
|
|
||||||
if ((ctx->cflags & REG_EXTENDED && depth > 0)
|
|
||||||
|| (ctx->re > ctx->re_start
|
|
||||||
&& *(ctx->re - 1) == CHAR_BACKSLASH))
|
|
||||||
{
|
|
||||||
/* We were expecting an atom, but instead the current
|
|
||||||
subexpression was closed. POSIX leaves the meaning of
|
|
||||||
this to be implementation-defined. We interpret this as
|
|
||||||
an empty expression (which matches an empty string). */
|
|
||||||
result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
|
|
||||||
if (result == NULL)
|
|
||||||
return REG_ESPACE;
|
|
||||||
if (!(ctx->cflags & REG_EXTENDED))
|
|
||||||
ctx->re--;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
goto parse_literal;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case CHAR_LBRACKET: /* bracket expression */
|
case CHAR_LBRACKET: /* bracket expression */
|
||||||
ctx->re++;
|
ctx->re++;
|
||||||
status = tre_parse_bracket(ctx, &result);
|
status = tre_parse_bracket(ctx, &result);
|
||||||
@ -1203,13 +1185,14 @@ tre_parse(tre_parse_ctx_t *ctx)
|
|||||||
case CHAR_BACKSLASH:
|
case CHAR_BACKSLASH:
|
||||||
/* If this is "\(" or "\)" chew off the backslash and
|
/* If this is "\(" or "\)" chew off the backslash and
|
||||||
try again. */
|
try again. */
|
||||||
if (!(ctx->cflags & REG_EXTENDED)
|
if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
|
||||||
&& (*(ctx->re + 1) == CHAR_LPAREN
|
|
||||||
|| *(ctx->re + 1) == CHAR_RPAREN))
|
|
||||||
{
|
{
|
||||||
ctx->re++;
|
ctx->re++;
|
||||||
STACK_PUSHX(stack, int, PARSE_ATOM);
|
goto lparen;
|
||||||
break;
|
}
|
||||||
|
if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
|
||||||
|
{
|
||||||
|
goto empty_atom;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If a macro is used, parse the expanded macro recursively. */
|
/* If a macro is used, parse the expanded macro recursively. */
|
||||||
@ -1369,12 +1352,9 @@ tre_parse(tre_parse_ctx_t *ctx)
|
|||||||
break;
|
break;
|
||||||
|
|
||||||
case CHAR_CARET: /* beginning of line assertion */
|
case CHAR_CARET: /* beginning of line assertion */
|
||||||
/* '^' has a special meaning everywhere in EREs, and in the
|
/* '^' has a special meaning everywhere in EREs, and at
|
||||||
beginning of the RE and after \( is BREs. */
|
beginning of BRE. */
|
||||||
if (ctx->cflags & REG_EXTENDED
|
if (ctx->cflags & REG_EXTENDED
|
||||||
|| (ctx->re - 2 >= ctx->re_start
|
|
||||||
&& *(ctx->re - 2) == CHAR_BACKSLASH
|
|
||||||
&& *(ctx->re - 1) == CHAR_LPAREN)
|
|
||||||
|| ctx->re == ctx->re_start)
|
|| ctx->re == ctx->re_start)
|
||||||
{
|
{
|
||||||
result = tre_ast_new_literal(ctx->mem, ASSERTION,
|
result = tre_ast_new_literal(ctx->mem, ASSERTION,
|
||||||
@ -1389,10 +1369,8 @@ tre_parse(tre_parse_ctx_t *ctx)
|
|||||||
|
|
||||||
case CHAR_DOLLAR: /* end of line assertion. */
|
case CHAR_DOLLAR: /* end of line assertion. */
|
||||||
/* '$' is special everywhere in EREs, and in the end of the
|
/* '$' is special everywhere in EREs, and in the end of the
|
||||||
string and before \) is BREs. */
|
string in BREs. */
|
||||||
if (ctx->cflags & REG_EXTENDED
|
if (ctx->cflags & REG_EXTENDED
|
||||||
|| (*(ctx->re + 1) == CHAR_BACKSLASH
|
|
||||||
&& *(ctx->re + 2) == CHAR_RPAREN)
|
|
||||||
|| !*(ctx->re + 1))
|
|| !*(ctx->re + 1))
|
||||||
{
|
{
|
||||||
result = tre_ast_new_literal(ctx->mem, ASSERTION,
|
result = tre_ast_new_literal(ctx->mem, ASSERTION,
|
||||||
@ -1405,34 +1383,27 @@ tre_parse(tre_parse_ctx_t *ctx)
|
|||||||
goto parse_literal;
|
goto parse_literal;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case CHAR_RPAREN:
|
||||||
|
if (!depth)
|
||||||
|
goto parse_literal;
|
||||||
|
case CHAR_STAR:
|
||||||
|
case CHAR_PIPE:
|
||||||
|
case CHAR_LBRACE:
|
||||||
|
case CHAR_PLUS:
|
||||||
|
case CHAR_QUESTIONMARK:
|
||||||
|
if (!(ctx->cflags & REG_EXTENDED))
|
||||||
|
goto parse_literal;
|
||||||
|
|
||||||
|
empty_atom:
|
||||||
|
result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
|
||||||
|
if (!result)
|
||||||
|
return REG_ESPACE;
|
||||||
|
break;
|
||||||
|
|
||||||
default:
|
default:
|
||||||
parse_literal:
|
parse_literal:
|
||||||
|
|
||||||
/* We are expecting an atom. If the subexpression (or the whole
|
clen = mbtowc(&wc, ctx->re, -1);
|
||||||
regexp ends here, we interpret it as an empty expression
|
|
||||||
(which matches an empty string). */
|
|
||||||
if (
|
|
||||||
(!*ctx->re
|
|
||||||
|| *ctx->re == CHAR_STAR
|
|
||||||
|| (ctx->cflags & REG_EXTENDED
|
|
||||||
&& (*ctx->re == CHAR_PIPE
|
|
||||||
|| *ctx->re == CHAR_LBRACE
|
|
||||||
|| *ctx->re == CHAR_PLUS
|
|
||||||
|| *ctx->re == CHAR_QUESTIONMARK))
|
|
||||||
/* Test for "\)" in BRE mode. */
|
|
||||||
|| (!(ctx->cflags & REG_EXTENDED)
|
|
||||||
&& !*(ctx->re + 1)
|
|
||||||
&& *ctx->re == CHAR_BACKSLASH
|
|
||||||
&& *(ctx->re + 1) == CHAR_LBRACE)))
|
|
||||||
{
|
|
||||||
result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
|
|
||||||
if (!result)
|
|
||||||
return REG_ESPACE;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
wchar_t wc;
|
|
||||||
int clen = mbtowc(&wc, ctx->re, -1);
|
|
||||||
if (clen<0) clen=1, wc=WEOF;
|
if (clen<0) clen=1, wc=WEOF;
|
||||||
|
|
||||||
/* Note that we can't use an tre_isalpha() test here, since there
|
/* Note that we can't use an tre_isalpha() test here, since there
|
||||||
|
Reference in New Issue
Block a user