amxmodx/modules/regex/CRegEx.cpp

// vim: set ts=4 sw=4 tw=99 noet:
//
// AMX Mod X, based on AMX Mod by Aleksander Naszko ("OLO").
// Copyright (C) The AMX Mod X Development Team.
//
// This software is licensed under the GNU General Public License, version 3 or higher.
// Additional exceptions apply. For full license details, see LICENSE.txt or visit:
//     https://alliedmods.net/amxmodx-license

//
// Regular Expressions Module
//

#include "amxxmodule.h"
#include "pcre.h"
#include "CRegEx.h"
#include <string.h>
#include <ctype.h>
#include "utils.h"

RegEx::RegEx()
{
	mErrorOffset = 0;
	mError = NULL;
	re = NULL;
	mFree = true;
	subject = NULL;
	mSubStrings.clear();
	mMatchesSubs.clear();
	mSubsNameTable.clear();
	mNumSubpatterns = 0;
}

void RegEx::Clear()
{
	mErrorOffset = 0;
	mError = NULL;
	if (re)
		pcre_free(re);
	re = NULL;
	mFree = true;
	if (subject)
		delete[] subject;
	subject = NULL;
	mSubStrings.clear();
	mMatchesSubs.clear();
	mSubsNameTable.clear();
	mNumSubpatterns = 0;
}

RegEx::~RegEx()
{
	Clear();
}

bool RegEx::isFree(bool set, bool val)
{
	if (set)
	{
		mFree = val;
		return true;
	} else {
		return mFree;
	}
}

int RegEx::Compile(const char *pattern, const char* flags)
{
	if (!mFree)
		Clear();


	int iFlags = 0;

	if (flags != NULL)
	{
		for ( ; *flags != 0; flags++)
		{
			switch (*flags)
			{
				case 'i':
				{
					iFlags |= PCRE_CASELESS;
					break;
				}
				case 'm':
				{
					iFlags |= PCRE_MULTILINE;
					break;
				}
				case 's':
				{
					iFlags |= PCRE_DOTALL;
					break;
				}
				case 'x':
				{
					iFlags |= PCRE_EXTENDED;
					break;
				}
				default:
				{
					break;
				}
			}
		}
	}

	re = pcre_compile(pattern, iFlags, &mError, &mErrorOffset, NULL);

	if (re == NULL)
	{
		return 0;
	}

	mFree = false;

	return 1;
}

int RegEx::Compile(const char *pattern, int iFlags)
{
	if (!mFree)
		Clear();

	re = pcre_compile(pattern, iFlags, &mError, &mErrorOffset, NULL);

	if (re == NULL)
	{
		return 0;
	}

	mFree = false;

	/**
	 * Retrieve the number of captured groups
	 * including the full match.
	 */
	pcre_fullinfo(re, NULL, PCRE_INFO_CAPTURECOUNT, &mNumSubpatterns);
	++mNumSubpatterns;

	/**
	 * Build the table with the named groups,
	 * which contain an index and a name per group.
	 */
	MakeSubpatternsTable(mNumSubpatterns);

	return 1;
}

int RegEx::Match(const char *str)
{
	int rc = 0;

	if (mFree || re == NULL)
		return -1;

	ClearMatch();

	//save str
	subject = new char[strlen(str) + 1];
	strcpy(subject, str);

	rc = pcre_exec(re, NULL, subject, (int)strlen(subject), 0, 0, ovector, REGEX_MAX_SUBPATTERNS);

	if (rc < 0)
	{
		if (rc == PCRE_ERROR_NOMATCH)
		{
			return 0;
		}
		else {
			mErrorOffset = rc;
			return -1;
		}
	}

	RegExSub res;
	mSubStrings.ensure(rc);

	for (int s = 0; s < rc; ++s)
	{
		res.start = ovector[2 * s];
		res.end = ovector[2 * s + 1];
		mSubStrings.append(res);
	}

	return 1;
}

int RegEx::MatchAll(const char *str)
{
	int rr = 0;
	int rc = 0;
	int startOffset = 0;
	int exoptions = 0;
	int notEmpty = 0;
	int sizeOffsets = mNumSubpatterns * 3;
	int subjectLen = strlen(str);

	if (mFree || re == NULL)
	{
		return -1;
	}

	ClearMatch();

	subject = new char[subjectLen + 1];
	strcpy(subject, str);

	RegExSub sub;

	while (1)
	{
		rr = pcre_exec(re, NULL, subject, (int)subjectLen, startOffset, exoptions | notEmpty, ovector, REGEX_MAX_SUBPATTERNS);

		/**
		 * The string was already proved to be valid UTF-8
		 */
		exoptions |= PCRE_NO_UTF8_CHECK;

		/**
		 * Too many substrings
		 */
		if (rr == 0)
		{
			rr = sizeOffsets / 3;
		}

		if (rr > 0)
		{
			mMatchesSubs.append(rr);

			for (int s = 0; s < rr; ++s)
			{
				sub.start = ovector[2 * s];
				sub.end = ovector[2 * s + 1];

				mSubStrings.append(sub);
			}
		}
		else if (rr == PCRE_ERROR_NOMATCH)
		{
			/**
			 * If we previously set PCRE_NOTEMPTY after a null match,
			 * this is not necessarily the end. We need to advance
			 * the start offset, and continue. Fudge the offset values
			 * to achieve this, unless we're already at the end of the string.
			 */
			if (notEmpty && startOffset < (int)subjectLen)
			{
				ovector[0] = startOffset;
				ovector[1] = startOffset + 1;
			}
			else
			{
				break;
			}
		}
		else
		{
			mErrorOffset = rr;

			if (mMatchesSubs.length())
			{
				ClearMatch();
			}

			return -1;
		}

		/**
		 * If we have matched an empty string, mimic what Perl's /g options does.
		 * This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
		 * the match again at the same point. If this fails (picked up above) we
		 * advance to the next character.
		 */
		notEmpty = (ovector[1] == ovector[0]) ? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;

		/**
		 * Advance to the next piece.
		 */
		startOffset = ovector[1];
	}

	if (!mMatchesSubs.length())
	{
		return 0;
	}

	return 1;
}

void RegEx::ClearMatch()
{
	// Clears match results
	mErrorOffset = 0;
	mError = NULL;
	if (subject)
		delete[] subject;
	subject = NULL;
	mSubStrings.clear();
	mMatchesSubs.clear();
}

const char *getSubstring(char *subject, size_t start, size_t end, char buffer[], size_t max, size_t *outlen)
{
	size_t i;
	char * substr_a = subject + start;
	size_t substr_l = end - start;

	for (i = 0; i < substr_l; i++)
	{
		if (i >= max)
			break;
		buffer[i] = substr_a[i];
	}

	buffer[i] = '\0';

	if (outlen)
	{
		*outlen = i;
	}

	return buffer;
}

const char *RegEx::GetSubstring(size_t start, char buffer[], size_t max, size_t *outlen)
{
	if (start >= mSubStrings.length())
	{
		return NULL;
	}

	RegExSub sub = mSubStrings.at(start);

	return getSubstring(subject, sub.start, sub.end, buffer, max, outlen);
}

void RegEx::MakeSubpatternsTable(int numSubpatterns)
{
	int nameCount = 0;
	int rc = pcre_fullinfo(re, NULL, PCRE_INFO_NAMECOUNT, &nameCount);

	if (rc < 0)
	{
		return;
	}

	if (nameCount > 0)
	{
		const char *nameTable;
		int nameSize = 0;
		int i = 0;

		int rc1 = pcre_fullinfo(re, NULL, PCRE_INFO_NAMETABLE, &nameTable);
		int rc2 = pcre_fullinfo(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameSize);

		rc = rc2 ? rc2 : rc1;

		if (rc < 0)
		{
			mSubsNameTable.clear();
			return;
		}

		NamedGroup data;

		while (i++ < nameCount)
		{
			data.index = 0xff * (unsigned char)nameTable[0] + (unsigned char)nameTable[1];
			data.name = nameTable + 2;

			mSubsNameTable.append(ke::Move(data));
			nameTable += nameSize;
		}
	}
}

int RegEx::Replace(char *text, size_t textMaxLen, const char *replace, size_t replaceLen, int flags)
{
	char *output = text;

	/**
	 * Retrieve all matches and store them in
	 * mSubStrings list.
	 */
	if (MatchAll(output) == -1)
	{
		return -1;
	}

	size_t subjectLen = strlen(subject);
	size_t total = 0;
	size_t baseIndex = 0;
	size_t diffLength = 0;

	char *toReplace = new char[textMaxLen + 1];
	char *toSearch = NULL;

	/**
	 * All characters which is not matched are not copied when replacing matches.
	 * Then original text (output buffer) should be considerated as empty.
	 */
	if (flags & REGEX_FORMAT_NOCOPY)
	{
		*output = '\0';
	}
	else
	{
		/**
		 * This is used only when we do replace matches.
		 */
		toSearch  = new char[textMaxLen + 1];
	}

	/**
	 * Loop over all matches found.
	 */
	for (size_t i = 0; i < mMatchesSubs.length(); ++i)
	{
		char *ptr = toReplace;

		size_t browsed = 0;
		size_t searchLen = 0;
		size_t length = 0;

		/**
		 * Build the replace string as it can contain backreference
		 * and this needs to be parsed.
		 */
		for (const char *s = replace, *end = s + replaceLen; s < end && browsed <= textMaxLen; ++s, ++browsed)
		{
			unsigned int c = *s;

			/**
			 * Supported format specifiers:
			 *
			 *   $number  : Substitutes the substring matched by group number.
			 *              n must be an integer value designating a valid backreference, greater than 0, and of two digits at most.
			 *   ${name}  : Substitutes the substring matched by the named group name (a maximum of 32 characters).
			 *   $&       : Substitutes a copy of the whole match.
			 *   $`       : Substitutes all the text of the input string before the match.
			 *   $'       : Substitutes all the text of the input string after the match.
			 *   $+       : Substitutes the last group that was captured.
			 *   $_       : Substitutes the entire input string.
			 *   $$       : Substitutes a literal "$".
			 */
			if (c == '$' || c == '\\')
			{
				switch (*++s)
				{
					case '\0':
					{
						/**
						 * End of string.
						 * Copy one character.
						 */
						 *(ptr + browsed) = c;
						 break;
					}
					case '&':
					{
						/**
						 * Concatenate retrieved full match sub-string.
						 * length - 1 to overwrite EOS.
						 */
						GetSubstring(baseIndex, ptr + browsed, textMaxLen, &length);
						browsed += length - 1;
						break;
					}
					case '`':
					{
						/**
						 * Concatenate part of original text up to
						 * first sub-string position.
						 */
						length = mSubStrings.at(baseIndex).start;
						memcpy(ptr + browsed, subject, length);
						browsed += length - 1;
						break;
					}
					case '\'':
					{
						/**
						 * Concatenate part of original text from
						 * last sub-string end position to EOS.
						 */
						length = mSubStrings.at(baseIndex).end;
						memcpy(ptr + browsed, subject + length, subjectLen - length);
						browsed += (subjectLen - length) - 1;
						break;
					}
					case '+':
					{
						/**
						 * Copy the last group that was captured.
						 */
						GetSubstring(baseIndex + mMatchesSubs.at(i) - 1, ptr + browsed, textMaxLen, &length);
						browsed += length - 1;
						break;
					}
					case '_':
					{
						/**
						 * Copy the entire input string.
						 */
						memcpy(ptr + browsed, subject, subjectLen);
						browsed += (subjectLen - 1);
						break;
					}
					case '$':
					case '\\':
					{
						/**
						 * Copy the single character $ or \.
						 */
						*(ptr + browsed) = c;
						break;
					}
					case '0': case '1':	case '2': case '3':	case '4':
					case '5': case '6': case '7': case '8': case '9':
					case '{':
					{
						/**
						 * Checking backreference.
						 * Which can be either $n, ${n} or ${name}.
						 */
						int backref = -1;
						const char *walk = s;
						bool inBrace = false;
						bool nameCheck = false;

						/**
						 * ${nn}.
						 *  ^
						 */
						if (*walk == '{')
						{
							inBrace = true;
							++walk;
						}

						/**
						 * Valid number.
						 * $nn or ${nn}
						 *  ^       ^
						 */
						if (*walk >= '0' && *walk <= '9')
						{
							backref = *walk - '0';
							++walk;
						}
						else if (inBrace)
						{
							nameCheck = true;

							/**
							 * Not a valid number.
							 * Checking as string.
							 * ${name}
							 *   ^
							 */
							if (*walk)
							{
								const char *pch = strchr(walk, '}');

								if (pch != NULL)
								{
									/**
									 * A named group maximum character is 32 (PCRE).
									 */
									char name[32];
									size_t nameLength = strncopy(name, walk, pch - walk + 1);

									int flags, num = 0;
									pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &flags);

									/**
									 * If PCRE_DUPNAMES is set, the pcre_copy_named_substring function should be used
									 * as pcre_get_stringnumber output order is not defined.
									 */
									if (flags & PCRE_DUPNAMES)
									{
										memset(ovector, 0, REGEX_MAX_SUBPATTERNS * sizeof(int));

										/**
										 * pcre_copy_named_substring needs a vector containing sub-patterns ranges
										 * for a given match.
										 */
										for (size_t j = 0; j < mMatchesSubs.at(i); ++j)
										{
											ovector[2 * j] = mSubStrings.at(baseIndex + j).start;
											ovector[2 * j + 1] = mSubStrings.at(baseIndex + j).end;
										}

										num = pcre_copy_named_substring(re, subject, ovector, mMatchesSubs.at(i), name, ptr + browsed, (int)textMaxLen);

										if (num != PCRE_ERROR_NOSUBSTRING)
										{
											browsed += num - 1;
											s = pch;
											break;
										}
										++pch;
									}
									else
									{
										/**
										 * Retrieve sub-pattern index from a give name.
										 */
										num = pcre_get_stringnumber(re, name);
										if (num != PCRE_ERROR_NOSUBSTRING)
										{
											backref = num;
											walk = ++pch;
										}
									}

									if (num == PCRE_ERROR_NOSUBSTRING || num >= (int)mMatchesSubs.at(i))
									{
										/**
										 * If a sub-string for a given match is not found,  or if > to
										 * number of sub-patterns we still need to check if this
										 * group name is a valid one because if so we want to escape it.
										 * Looking at the name table.
										 */
										bool found = false;
										for (size_t i = 0; i < mSubsNameTable.length(); ++i)
										{
											if (!mSubsNameTable.at(i).name.compare(name))
											{
												--browsed;
												s = --pch;
												found = true;
												break;
											}
										}

										if (found)
										{
											continue;
										}
									}
								}
							}
						}

						if (!nameCheck)
						{
							/**
							 * Valid second number.
							 * $nn or ${nn}
							 *   ^       ^
							 */
							if (*walk >= '0' && *walk <= '9')
							{
								backref = backref * 10 + *walk - '0';
								++walk;
							}

							if (inBrace)
							{
								/**
								 * Invalid specifier
								 * Either hit EOS or missing }.
								 * ${n  or ${nn  or ${nx or ${nnx
								 *    ^        ^       ^        ^
								 */
								if (*walk == '\0' || *walk != '}')
								{
									backref = -1;
								}
								else
								{
									++walk;
								}
							}
						}

						length = walk - s;
						s = --walk;

						/**
						 * We can't provide a capture number >= to total that pcre_exec has found.
						 * 0 is implicitly accepted, same behavior as $&.
						 */
						if (backref >= 0 && backref < mNumSubpatterns)
						{
							/**
							 * Valid available index for a given match.
							 */
							if ((size_t)backref < mMatchesSubs.at(i))
							{
								/**
								 * Concatenate retrieved sub-string.
								 * length - 1 to overwrite EOS.
								 */
								GetSubstring(baseIndex + backref, ptr + browsed, textMaxLen, &length);
								browsed += length - 1;
							}
							else
							{
								/**
								 * Valid unavailable index for a given match.
								 */
								--browsed;
							}
						}
						else
						{
							/**
							 * If we here it means the syntax is valid but sub-pattern doesn't exist.
							 * So, copy as it is, including $.
							 */
							memcpy(ptr + browsed, s - length, length + 1);
							browsed += length;
						}

						break;
					}
					default:
					{
						/**
						 * Not a valid format modifier.
						 * So we copy characters as it is.
						 */
						*(ptr + browsed) = *s;
						break;
					}
				}
			}
			else
			{
				/**
				 * At this point, direct copy.
				 */
				*(ptr + browsed) = c;
			}
		}

		*(ptr + browsed) = '\0';

		/**
		 * Concatenate only replace string of each match,
		 * as we don't want to copy unmatched characters.
		 */
		if (flags & REGEX_FORMAT_NOCOPY)
		{
			/**
			 * We want just the first occurrence.
			 */
			if (total++ && (flags & REGEX_FORMAT_FIRSTONLY))
			{
				break;
			}

			strncat(output, toReplace, textMaxLen + 1);
		}
		else
		{
			/**
			 * Retrieves full string of a given match.
			 */
			const char *search = GetSubstring(baseIndex, toSearch, textMaxLen, &searchLen);

			/**
			 * We get something to replace, but the sub-pattern to search is empty.
			 * We insert replacement either a the start end or string.
			 */
			if (*toReplace && !searchLen)
			{
				if (output - text > 0)
				{
					strncat(output, toReplace, textMaxLen);
				}
				else
				{
					strncat(toReplace, text, textMaxLen);
					strncopy(text, toReplace, strlen(toReplace) + 1);
				}

				++total;
			}
			else if ((output = UTIL_ReplaceEx(text + mSubStrings.at(baseIndex).start + diffLength, textMaxLen, search, searchLen, toReplace, browsed, false)) != NULL)
			{
				/**
				 * Then we simply do a replace.
				 * Probably not the most efficient, but this should be at least safe.
				 * To avoid issue where the function could find a string which is not at the expected index,
				 * We force the input string to start from index of the full match.
				 */
				++total;
			}

			if (total && (flags & REGEX_FORMAT_FIRSTONLY))
			{
				break;
			}
		}

		/**
		 * mMatchesSubs is a flat list containing all sub-patterns of all matches.
		 * A number of sub-patterns can vary per match. So we calculate the position in the list,
		 * from where the first sub-pattern result of current match starts.
		 */
		baseIndex  += mMatchesSubs.at(i);
		diffLength += browsed - searchLen;
	}

	delete[] toReplace;

	if (toSearch != NULL)
	{
		delete[] toSearch;
	}

	/**
	 * Return the number of successful replacements.
	 */
	return total;
}