Regex: Update PCRE to v8.35.
I was über lazy at first, so took libs from SM. But actually it's quite easy to compile, so let's update to latest version \o/.
This commit is contained in:
@@ -66,8 +66,9 @@ string of that length that matches. In UTF8 mode, the result is in characters
|
||||
rather than bytes.
|
||||
|
||||
Arguments:
|
||||
re compiled pattern block
|
||||
code pointer to start of group (the bracket)
|
||||
startcode pointer to start of the whole pattern
|
||||
startcode pointer to start of the whole pattern's code
|
||||
options the compiling options
|
||||
int RECURSE depth
|
||||
|
||||
@@ -78,8 +79,8 @@ Returns: the minimum length
|
||||
*/
|
||||
|
||||
static int
|
||||
find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
|
||||
int recurse_depth)
|
||||
find_minlength(const REAL_PCRE *re, const pcre_uchar *code,
|
||||
const pcre_uchar *startcode, int options, int recurse_depth)
|
||||
{
|
||||
int length = -1;
|
||||
/* PCRE_UTF16 has the same value as PCRE_UTF8. */
|
||||
@@ -129,7 +130,7 @@ for (;;)
|
||||
case OP_SBRAPOS:
|
||||
case OP_ONCE:
|
||||
case OP_ONCE_NC:
|
||||
d = find_minlength(cc, startcode, options, recurse_depth);
|
||||
d = find_minlength(re, cc, startcode, options, recurse_depth);
|
||||
if (d < 0) return d;
|
||||
branchlength += d;
|
||||
do cc += GET(cc, 1); while (*cc == OP_ALT);
|
||||
@@ -175,9 +176,9 @@ for (;;)
|
||||
|
||||
case OP_REVERSE:
|
||||
case OP_CREF:
|
||||
case OP_NCREF:
|
||||
case OP_DNCREF:
|
||||
case OP_RREF:
|
||||
case OP_NRREF:
|
||||
case OP_DNRREF:
|
||||
case OP_DEF:
|
||||
case OP_CALLOUT:
|
||||
case OP_SOD:
|
||||
@@ -341,6 +342,7 @@ for (;;)
|
||||
{
|
||||
case OP_CRPLUS:
|
||||
case OP_CRMINPLUS:
|
||||
case OP_CRPOSPLUS:
|
||||
branchlength++;
|
||||
/* Fall through */
|
||||
|
||||
@@ -348,11 +350,14 @@ for (;;)
|
||||
case OP_CRMINSTAR:
|
||||
case OP_CRQUERY:
|
||||
case OP_CRMINQUERY:
|
||||
case OP_CRPOSSTAR:
|
||||
case OP_CRPOSQUERY:
|
||||
cc++;
|
||||
break;
|
||||
|
||||
case OP_CRRANGE:
|
||||
case OP_CRMINRANGE:
|
||||
case OP_CRPOSRANGE:
|
||||
branchlength += GET2(cc,1);
|
||||
cc += 1 + 2 * IMM2_SIZE;
|
||||
break;
|
||||
@@ -375,7 +380,38 @@ for (;;)
|
||||
matches an empty string (by default it causes a matching failure), so in
|
||||
that case we must set the minimum length to zero. */
|
||||
|
||||
case OP_REF:
|
||||
case OP_DNREF: /* Duplicate named pattern back reference */
|
||||
case OP_DNREFI:
|
||||
if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
|
||||
{
|
||||
int count = GET2(cc, 1+IMM2_SIZE);
|
||||
pcre_uchar *slot = (pcre_uchar *)re +
|
||||
re->name_table_offset + GET2(cc, 1) * re->name_entry_size;
|
||||
d = INT_MAX;
|
||||
while (count-- > 0)
|
||||
{
|
||||
ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
|
||||
if (cs == NULL) return -2;
|
||||
do ce += GET(ce, 1); while (*ce == OP_ALT);
|
||||
if (cc > cs && cc < ce)
|
||||
{
|
||||
d = 0;
|
||||
had_recurse = TRUE;
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
int dd = find_minlength(re, cs, startcode, options, recurse_depth);
|
||||
if (dd < d) d = dd;
|
||||
}
|
||||
slot += re->name_entry_size;
|
||||
}
|
||||
}
|
||||
else d = 0;
|
||||
cc += 1 + 2*IMM2_SIZE;
|
||||
goto REPEAT_BACK_REFERENCE;
|
||||
|
||||
case OP_REF: /* Single back reference */
|
||||
case OP_REFI:
|
||||
if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
|
||||
{
|
||||
@@ -389,7 +425,7 @@ for (;;)
|
||||
}
|
||||
else
|
||||
{
|
||||
d = find_minlength(cs, startcode, options, recurse_depth);
|
||||
d = find_minlength(re, cs, startcode, options, recurse_depth);
|
||||
}
|
||||
}
|
||||
else d = 0;
|
||||
@@ -397,24 +433,29 @@ for (;;)
|
||||
|
||||
/* Handle repeated back references */
|
||||
|
||||
REPEAT_BACK_REFERENCE:
|
||||
switch (*cc)
|
||||
{
|
||||
case OP_CRSTAR:
|
||||
case OP_CRMINSTAR:
|
||||
case OP_CRQUERY:
|
||||
case OP_CRMINQUERY:
|
||||
case OP_CRPOSSTAR:
|
||||
case OP_CRPOSQUERY:
|
||||
min = 0;
|
||||
cc++;
|
||||
break;
|
||||
|
||||
case OP_CRPLUS:
|
||||
case OP_CRMINPLUS:
|
||||
case OP_CRPOSPLUS:
|
||||
min = 1;
|
||||
cc++;
|
||||
break;
|
||||
|
||||
case OP_CRRANGE:
|
||||
case OP_CRMINRANGE:
|
||||
case OP_CRPOSRANGE:
|
||||
min = GET2(cc, 1);
|
||||
cc += 1 + 2 * IMM2_SIZE;
|
||||
break;
|
||||
@@ -437,7 +478,8 @@ for (;;)
|
||||
had_recurse = TRUE;
|
||||
else
|
||||
{
|
||||
branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
|
||||
branchlength += find_minlength(re, cs, startcode, options,
|
||||
recurse_depth + 1);
|
||||
}
|
||||
cc += 1 + LINK_SIZE;
|
||||
break;
|
||||
@@ -778,6 +820,10 @@ do
|
||||
case OP_COND:
|
||||
case OP_CREF:
|
||||
case OP_DEF:
|
||||
case OP_DNCREF:
|
||||
case OP_DNREF:
|
||||
case OP_DNREFI:
|
||||
case OP_DNRREF:
|
||||
case OP_DOLL:
|
||||
case OP_DOLLM:
|
||||
case OP_END:
|
||||
@@ -786,7 +832,6 @@ do
|
||||
case OP_EXTUNI:
|
||||
case OP_FAIL:
|
||||
case OP_MARK:
|
||||
case OP_NCREF:
|
||||
case OP_NOT:
|
||||
case OP_NOTEXACT:
|
||||
case OP_NOTEXACTI:
|
||||
@@ -818,7 +863,6 @@ do
|
||||
case OP_NOTUPTOI:
|
||||
case OP_NOT_HSPACE:
|
||||
case OP_NOT_VSPACE:
|
||||
case OP_NRREF:
|
||||
case OP_PROP:
|
||||
case OP_PRUNE:
|
||||
case OP_PRUNE_ARG:
|
||||
@@ -835,9 +879,6 @@ do
|
||||
case OP_SOM:
|
||||
case OP_THEN:
|
||||
case OP_THEN_ARG:
|
||||
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
|
||||
case OP_XCLASS:
|
||||
#endif
|
||||
return SSB_FAIL;
|
||||
|
||||
/* We can ignore word boundary tests. */
|
||||
@@ -1183,24 +1224,16 @@ do
|
||||
set_type_bits(start_bits, cbit_digit, table_limit, cd);
|
||||
break;
|
||||
|
||||
/* The cbit_space table has vertical tab as whitespace; we have to
|
||||
ensure it gets set as not whitespace. Luckily, the code value is the
|
||||
same (0x0b) in ASCII and EBCDIC, so we can just adjust the appropriate
|
||||
bit. */
|
||||
/* The cbit_space table has vertical tab as whitespace; we no longer
|
||||
have to play fancy tricks because Perl added VT to its whitespace at
|
||||
release 5.18. PCRE added it at release 8.34. */
|
||||
|
||||
case OP_NOT_WHITESPACE:
|
||||
set_nottype_bits(start_bits, cbit_space, table_limit, cd);
|
||||
start_bits[1] |= 0x08;
|
||||
break;
|
||||
|
||||
/* The cbit_space table has vertical tab as whitespace; we have to
|
||||
avoid setting it. Luckily, the code value is the same (0x0b) in ASCII
|
||||
and EBCDIC, so we can just adjust the appropriate bit. */
|
||||
|
||||
case OP_WHITESPACE:
|
||||
c = start_bits[1]; /* Save in case it was already set */
|
||||
set_type_bits(start_bits, cbit_space, table_limit, cd);
|
||||
start_bits[1] = (start_bits[1] & ~0x08) | c;
|
||||
break;
|
||||
|
||||
case OP_NOT_WORDCHAR:
|
||||
@@ -1221,6 +1254,16 @@ do
|
||||
with a value >= 0xc4 is a potentially valid starter because it starts a
|
||||
character with a value > 255. */
|
||||
|
||||
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
|
||||
case OP_XCLASS:
|
||||
if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0)
|
||||
return SSB_FAIL;
|
||||
/* All bits are set. */
|
||||
if ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0 && (tcode[1 + LINK_SIZE] & XCL_NOT) != 0)
|
||||
return SSB_FAIL;
|
||||
#endif
|
||||
/* Fall through */
|
||||
|
||||
case OP_NCLASS:
|
||||
#if defined SUPPORT_UTF && defined COMPILE_PCRE8
|
||||
if (utf)
|
||||
@@ -1237,8 +1280,21 @@ do
|
||||
case OP_CLASS:
|
||||
{
|
||||
pcre_uint8 *map;
|
||||
tcode++;
|
||||
map = (pcre_uint8 *)tcode;
|
||||
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
|
||||
map = NULL;
|
||||
if (*tcode == OP_XCLASS)
|
||||
{
|
||||
if ((tcode[1 + LINK_SIZE] & XCL_MAP) != 0)
|
||||
map = (pcre_uint8 *)(tcode + 1 + LINK_SIZE + 1);
|
||||
tcode += GET(tcode, 1);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
tcode++;
|
||||
map = (pcre_uint8 *)tcode;
|
||||
tcode += 32 / sizeof(pcre_uchar);
|
||||
}
|
||||
|
||||
/* In UTF-8 mode, the bits in a bit map correspond to character
|
||||
values, not to byte values. However, the bit map we are constructing is
|
||||
@@ -1246,42 +1302,49 @@ do
|
||||
value is > 127. In fact, there are only two possible starting bytes for
|
||||
characters in the range 128 - 255. */
|
||||
|
||||
#if defined SUPPORT_UTF && defined COMPILE_PCRE8
|
||||
if (utf)
|
||||
{
|
||||
for (c = 0; c < 16; c++) start_bits[c] |= map[c];
|
||||
for (c = 128; c < 256; c++)
|
||||
{
|
||||
if ((map[c/8] && (1 << (c&7))) != 0)
|
||||
{
|
||||
int d = (c >> 6) | 0xc0; /* Set bit for this starter */
|
||||
start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
|
||||
c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
#if defined SUPPORT_UTF || !defined COMPILE_PCRE8
|
||||
if (map != NULL)
|
||||
#endif
|
||||
{
|
||||
/* In non-UTF-8 mode, the two bit maps are completely compatible. */
|
||||
for (c = 0; c < 32; c++) start_bits[c] |= map[c];
|
||||
#if defined SUPPORT_UTF && defined COMPILE_PCRE8
|
||||
if (utf)
|
||||
{
|
||||
for (c = 0; c < 16; c++) start_bits[c] |= map[c];
|
||||
for (c = 128; c < 256; c++)
|
||||
{
|
||||
if ((map[c/8] && (1 << (c&7))) != 0)
|
||||
{
|
||||
int d = (c >> 6) | 0xc0; /* Set bit for this starter */
|
||||
start_bits[d/8] |= (1 << (d&7)); /* and then skip on to the */
|
||||
c = (c & 0xc0) + 0x40 - 1; /* next relevant character. */
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
/* In non-UTF-8 mode, the two bit maps are completely compatible. */
|
||||
for (c = 0; c < 32; c++) start_bits[c] |= map[c];
|
||||
}
|
||||
}
|
||||
|
||||
/* Advance past the bit map, and act on what follows. For a zero
|
||||
minimum repeat, continue; otherwise stop processing. */
|
||||
|
||||
tcode += 32 / sizeof(pcre_uchar);
|
||||
switch (*tcode)
|
||||
{
|
||||
case OP_CRSTAR:
|
||||
case OP_CRMINSTAR:
|
||||
case OP_CRQUERY:
|
||||
case OP_CRMINQUERY:
|
||||
case OP_CRPOSSTAR:
|
||||
case OP_CRPOSQUERY:
|
||||
tcode++;
|
||||
break;
|
||||
|
||||
case OP_CRRANGE:
|
||||
case OP_CRMINRANGE:
|
||||
case OP_CRPOSRANGE:
|
||||
if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
|
||||
else try_next = FALSE;
|
||||
break;
|
||||
@@ -1346,6 +1409,7 @@ pcre_uchar *code;
|
||||
compile_data compile_block;
|
||||
const REAL_PCRE *re = (const REAL_PCRE *)external_re;
|
||||
|
||||
|
||||
*errorptr = NULL;
|
||||
|
||||
if (re == NULL || re->magic_number != MAGIC_NUMBER)
|
||||
@@ -1422,7 +1486,7 @@ if ((re->options & PCRE_ANCHORED) == 0 &&
|
||||
|
||||
/* Find the minimum length of subject string. */
|
||||
|
||||
switch(min = find_minlength(code, code, re->options, 0))
|
||||
switch(min = find_minlength(re, code, code, re->options, 0))
|
||||
{
|
||||
case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
|
||||
case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
|
||||
|
||||
Reference in New Issue
Block a user