fix ugly bugs in TRE regex parser

1. * in BRE is not special at the beginning of the regex or a subexpression. this broke ncurses' build scripts. 2. \\( in BRE is a literal \ followed by a literal (, not a literal \ followed by a subexpression opener. 3. the ^ in \\(^ in BRE is a literal ^; ^ is an anchor only at the beginning of the entire BRE. POSIX allows treating it as an anchor at the beginning of a subexpression, but TRE's code for checking if it was at the beginning of a subexpression was wrong, and fixing it for the sake of supporting a non-portable usage was too much trouble when just removing this non-portable behavior was much easier. this patch also moved lots of the ugly logic for empty atom checking out of the default/literal case and into new cases for the relevant characters. this should make parsing faster and make the code smaller. if nothing else it's a lot more readable/logical. at some point i'd like to revisit and overhaul lots of this code...
2025-06-26 21:22:11 +00:00 · 2012-05-07 14:50:49 -04:00
parent 2d3e2a7fc1
commit d7a90b35b9
1 changed files with 31 additions and 60 deletions
--- a/src/regex/regcomp.c
+++ b/src/regex/regcomp.c
@ -961,6 +961,8 @@ tre_parse(tre_parse_ctx_t *ctx)
  tre_stack_t *stack = ctx->stack;
  int bottom = tre_stack_num_objects(stack);
  int depth = 0;
  wchar_t wc;
  int clen;
  if (!ctx->nofirstsub)
    {
@ -1155,10 +1157,9 @@ tre_parse(tre_parse_ctx_t *ctx)
 	    {
 	    case CHAR_LPAREN:  /* parenthesized subexpression */
-	      if (ctx->cflags & REG_EXTENDED
+	      if (ctx->cflags & REG_EXTENDED)
 		  || (ctx->re > ctx->re_start
 		      && *(ctx->re - 1) == CHAR_BACKSLASH))
 		{
 		lparen:
 		  depth++;
 		    {
 		      ctx->re++;
@ -1174,25 +1175,6 @@ tre_parse(tre_parse_ctx_t *ctx)
 		goto parse_literal;
 	      break;
 	    case CHAR_RPAREN:  /* end of current subexpression */
 	      if ((ctx->cflags & REG_EXTENDED && depth > 0)
 		  || (ctx->re > ctx->re_start
 		      && *(ctx->re - 1) == CHAR_BACKSLASH))
 		{
 		  /* We were expecting an atom, but instead the current
 		     subexpression was closed.	POSIX leaves the meaning of
 		     this to be implementation-defined.	 We interpret this as
 		     an empty expression (which matches an empty string).  */
 		  result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 		  if (result == NULL)
 		    return REG_ESPACE;
 		  if (!(ctx->cflags & REG_EXTENDED))
 		    ctx->re--;
 		}
 	      else
 		goto parse_literal;
 	      break;
 	    case CHAR_LBRACKET: /* bracket expression */
 	      ctx->re++;
 	      status = tre_parse_bracket(ctx, &result);
@ -1203,13 +1185,14 @@ tre_parse(tre_parse_ctx_t *ctx)
 	    case CHAR_BACKSLASH:
 	      /* If this is "\(" or "\)" chew off the backslash and
 		 try again. */
-	      if (!(ctx->cflags & REG_EXTENDED)
+	      if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
 		  && (*(ctx->re + 1) == CHAR_LPAREN
 		      || *(ctx->re + 1) == CHAR_RPAREN))
 		{
 		  ctx->re++;
-		  STACK_PUSHX(stack, int, PARSE_ATOM);
+		  goto lparen;
-		  break;
+		}
 	      if (!(ctx->cflags & REG_EXTENDED) && *(ctx->re + 1) == CHAR_LPAREN)
 		{
 		  goto empty_atom;
 		}
 	      /* If a macro is used, parse the expanded macro recursively. */
@ -1369,12 +1352,9 @@ tre_parse(tre_parse_ctx_t *ctx)
 	      break;
 	    case CHAR_CARET:	 /* beginning of line assertion */
-	      /* '^' has a special meaning everywhere in EREs, and in the
+	      /* '^' has a special meaning everywhere in EREs, and at
-		 beginning of the RE and after \( is BREs. */
+		 beginning of BRE. */
 	      if (ctx->cflags & REG_EXTENDED
 		  || (ctx->re - 2 >= ctx->re_start
 		      && *(ctx->re - 2) == CHAR_BACKSLASH
 		      && *(ctx->re - 1) == CHAR_LPAREN)
 		  || ctx->re == ctx->re_start)
 		{
 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
@ -1389,10 +1369,8 @@ tre_parse(tre_parse_ctx_t *ctx)
 	    case CHAR_DOLLAR:	 /* end of line assertion. */
 	      /* '$' is special everywhere in EREs, and in the end of the
-		 string and before \) is BREs. */
+		 string in BREs. */
 	      if (ctx->cflags & REG_EXTENDED
 		  || (*(ctx->re + 1) == CHAR_BACKSLASH
 		      && *(ctx->re + 2) == CHAR_RPAREN)
 		  || !*(ctx->re + 1))
 		{
 		  result = tre_ast_new_literal(ctx->mem, ASSERTION,
@ -1405,34 +1383,27 @@ tre_parse(tre_parse_ctx_t *ctx)
 		goto parse_literal;
 	      break;
 	    case CHAR_RPAREN:
 	      if (!depth)
 	        goto parse_literal;
 	    case CHAR_STAR:
 	    case CHAR_PIPE:
 	    case CHAR_LBRACE:
 	    case CHAR_PLUS:
 	    case CHAR_QUESTIONMARK:
 	      if (!(ctx->cflags & REG_EXTENDED))
 	        goto parse_literal;
 	    empty_atom:
 	      result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 	      if (!result)
 		return REG_ESPACE;
 	      break;
 	    default:
 	    parse_literal:
-	      /* We are expecting an atom.  If the subexpression (or the whole
+	      clen = mbtowc(&wc, ctx->re, -1);
 		 regexp ends here, we interpret it as an empty expression
 		 (which matches an empty string).  */
 	      if (
 		  (!*ctx->re
 		   || *ctx->re == CHAR_STAR
 		   || (ctx->cflags & REG_EXTENDED
 		       && (*ctx->re == CHAR_PIPE
 			   || *ctx->re == CHAR_LBRACE
 			   || *ctx->re == CHAR_PLUS
 			   || *ctx->re == CHAR_QUESTIONMARK))
 		   /* Test for "\)" in BRE mode. */
 		   || (!(ctx->cflags & REG_EXTENDED)
 		       && !*(ctx->re + 1)
 		       && *ctx->re == CHAR_BACKSLASH
 		       && *(ctx->re + 1) == CHAR_LBRACE)))
 		{
 		  result = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
 		  if (!result)
 		    return REG_ESPACE;
 		  break;
 		}
 	      wchar_t wc;
 	      int clen = mbtowc(&wc, ctx->re, -1);
 	      if (clen<0) clen=1, wc=WEOF;
 	      /* Note that we can't use an tre_isalpha() test here, since there