amxmodx/dlls/regex/CRegEx.cpp
2014-07-17 11:21:06 +02:00

853 lines
18 KiB
C++
Executable File

/* AMX Mod X
* Regular Expressions Module
*
* by the AMX Mod X Development Team
*
* This file is part of AMX Mod X.
*
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* In addition, as a special exception, the author gives permission to
* link the code of this program with the Half-Life Game Engine ("HL
* Engine") and Modified Game Libraries ("MODs") developed by Valve,
* L.L.C ("Valve"). You must obey the GNU General Public License in all
* respects for all of the code used other than the HL Engine and MODs
* from Valve. If you modify this file, you may extend this exception
* to your version of the file, but you are not obligated to do so. If
* you do not wish to do so, delete this exception statement from your
* version.
*/
#include "amxxmodule.h"
#include "pcre.h"
#include "CRegEx.h"
#include <string.h>
#include <ctype.h>
#include "utils.h"
/**
 * Builds an empty, unused regex slot: no compiled pattern, no saved
 * subject, no error information and no match results.
 */
RegEx::RegEx()
{
	// No pattern compiled yet, so the slot is available.
	re = NULL;
	mFree = true;

	// No subject string has been saved by a match call.
	subject = NULL;

	// No compile/match error recorded.
	mError = NULL;
	mErrorOffset = 0;

	// Match results and named-group information start out empty.
	mNumSubpatterns = 0;
	mSubStrings.clear();
	mMatchesSubs.clear();
	mSubsNameTable.clear();
}
/**
 * Releases the compiled pattern and the saved subject string, drops all
 * match results and named-group entries, and marks the slot as free.
 */
void RegEx::Clear()
{
	// Forget any previous error.
	mError = NULL;
	mErrorOffset = 0;

	// Release the compiled pattern, if there is one.
	if (re != NULL)
	{
		pcre_free(re);
	}

	re = NULL;
	mFree = true;

	// Release the copy of the last subject string, if there is one.
	if (subject != NULL)
	{
		delete[] subject;
	}

	subject = NULL;

	// Drop match results and named-group information.
	mSubStrings.clear();
	mMatchesSubs.clear();
	mSubsNameTable.clear();
	mNumSubpatterns = 0;
}
/**
 * Destructor: releases everything the object still holds via Clear().
 */
RegEx::~RegEx()
{
	this->Clear();
}
/**
 * Combined getter/setter for the "free slot" flag.
 *
 * @param set	When true, the flag is overwritten with val.
 * @param val	New flag value, only used when set is true.
 * @return		true after a write; otherwise the current flag value.
 */
bool RegEx::isFree(bool set, bool val)
{
	// Read-only query: report the current state.
	if (!set)
	{
		return mFree;
	}

	mFree = val;

	return true;
}
int RegEx::Compile(const char *pattern, const char* flags)
{
if (!mFree)
Clear();
int iFlags = 0;
if (flags != NULL)
{
for ( ; *flags != 0; flags++)
{
switch (*flags)
{
case 'i':
{
iFlags |= PCRE_CASELESS;
break;
}
case 'm':
{
iFlags |= PCRE_MULTILINE;
break;
}
case 's':
{
iFlags |= PCRE_DOTALL;
break;
}
case 'x':
{
iFlags |= PCRE_EXTENDED;
break;
}
default:
{
break;
}
}
}
}
re = pcre_compile(pattern, iFlags, &mError, &mErrorOffset, NULL);
if (re == NULL)
{
return 0;
}
mFree = false;
return 1;
}
/**
 * Compiles a pattern using PCRE option bits directly.
 *
 * On success the number of sub-patterns (including the full match) and the
 * named-group table are recorded for later use.
 *
 * @param pattern	Regular expression to compile.
 * @param iFlags	PCRE compile options.
 * @return			1 on success, 0 if the pattern failed to compile
 *					(mError / mErrorOffset describe the failure).
 */
int RegEx::Compile(const char *pattern, int iFlags)
{
	// Drop whatever a previous compilation left behind.
	if (!mFree)
	{
		Clear();
	}

	re = pcre_compile(pattern, iFlags, &mError, &mErrorOffset, NULL);

	if (re != NULL)
	{
		mFree = false;

		/**
		 * Number of capturing groups, plus one for the full match.
		 */
		pcre_fullinfo(re, NULL, PCRE_INFO_CAPTURECOUNT, &mNumSubpatterns);
		mNumSubpatterns += 1;

		/**
		 * Record the index and name of every named group.
		 */
		MakeSubpatternsTable(mNumSubpatterns);

		return 1;
	}

	return 0;
}
int RegEx::Match(const char *str)
{
int rc = 0;
if (mFree || re == NULL)
return -1;
ClearMatch();
//save str
subject = new char[strlen(str) + 1];
strcpy(subject, str);
rc = pcre_exec(re, NULL, subject, (int)strlen(subject), 0, 0, ovector, REGEX_MAX_SUBPATTERNS);
if (rc < 0)
{
if (rc == PCRE_ERROR_NOMATCH)
{
return 0;
}
else {
mErrorOffset = rc;
return -1;
}
}
RegExSub res;
mSubStrings.ensure(rc);
for (int s = 0; s < rc; ++s)
{
res.start = ovector[2 * s];
res.end = ovector[2 * s + 1];
mSubStrings.append(res);
}
return 1;
}
int RegEx::MatchAll(const char *str)
{
int rr = 0;
int rc = 0;
int startOffset = 0;
int exoptions = 0;
int notEmpty = 0;
int sizeOffsets = mNumSubpatterns * 3;
int subjectLen = strlen(str);
if (mFree || re == NULL)
{
return -1;
}
ClearMatch();
subject = new char[subjectLen + 1];
strcpy(subject, str);
RegExSub sub;
while (1)
{
rr = pcre_exec(re, NULL, subject, (int)subjectLen, startOffset, exoptions | notEmpty, ovector, REGEX_MAX_SUBPATTERNS);
/**
* The string was already proved to be valid UTF-8
*/
exoptions |= PCRE_NO_UTF8_CHECK;
/**
* Too many substrings
*/
if (rr == 0)
{
rr = sizeOffsets / 3;
}
if (rr > 0)
{
mMatchesSubs.append(rr);
for (int s = 0; s < rr; ++s)
{
sub.start = ovector[2 * s];
sub.end = ovector[2 * s + 1];
mSubStrings.append(sub);
}
}
else if (rr == PCRE_ERROR_NOMATCH)
{
/**
* If we previously set PCRE_NOTEMPTY after a null match,
* this is not necessarily the end. We need to advance
* the start offset, and continue. Fudge the offset values
* to achieve this, unless we're already at the end of the string.
*/
if (notEmpty && startOffset < (int)subjectLen)
{
ovector[0] = startOffset;
ovector[1] = startOffset + 1;
}
else
{
break;
}
}
else
{
mErrorOffset = rr;
if (mMatchesSubs.length())
{
ClearMatch();
}
return -1;
}
/**
* If we have matched an empty string, mimic what Perl's /g options does.
* This turns out to be rather cunning. First we set PCRE_NOTEMPTY and try
* the match again at the same point. If this fails (picked up above) we
* advance to the next character.
*/
notEmpty = (ovector[1] == ovector[0]) ? PCRE_NOTEMPTY | PCRE_ANCHORED : 0;
/**
* Advance to the next piece.
*/
startOffset = ovector[1];
}
if (!mMatchesSubs.length())
{
return 0;
}
return 1;
}
/**
 * Discards the results of the last match (error state, saved subject and
 * recorded offsets) while keeping the compiled pattern.
 */
void RegEx::ClearMatch()
{
	// Reset error information.
	mError = NULL;
	mErrorOffset = 0;

	// Release the saved subject string, if any.
	if (subject != NULL)
	{
		delete[] subject;
	}

	subject = NULL;

	// Forget all recorded offsets.
	mMatchesSubs.clear();
	mSubStrings.clear();
}
/**
 * Copies the range [start, end) of subject into buffer and null-terminates it.
 *
 * At most max characters are copied, and the terminator is written right
 * after them, so buffer must provide room for max + 1 bytes.
 *
 * An empty string is produced when the range is empty or inverted
 * (end <= start) or when subject is NULL. This avoids the unsigned wrap-around
 * of end - start, which would otherwise copy up to max bytes of unrelated
 * memory.
 *
 * @param subject	String the offsets refer to.
 * @param start		Offset of the first character to copy.
 * @param end		Offset one past the last character to copy.
 * @param buffer	Destination buffer.
 * @param max		Maximum number of characters to copy, terminator excluded.
 * @param outlen	Optional; receives the number of characters copied.
 * @return			buffer.
 */
const char *getSubstring(char *subject, size_t start, size_t end, char buffer[], size_t max, size_t *outlen)
{
	size_t copied = 0;

	if (subject != NULL && end > start)
	{
		copied = end - start;

		if (copied > max)
		{
			copied = max;
		}

		// memmove keeps the call well defined even if the buffers overlap.
		memmove(buffer, subject + start, copied);
	}

	buffer[copied] = '\0';

	if (outlen)
	{
		*outlen = copied;
	}

	return buffer;
}
const char *RegEx::GetSubstring(size_t start, char buffer[], size_t max, size_t *outlen)
{
if (start < 0 || start >= mSubStrings.length())
{
return NULL;
}
RegExSub sub = mSubStrings.at(start);
return getSubstring(subject, sub.start, sub.end, buffer, max, outlen);
}
/**
 * Copies the range [startOffset, endOffset) of the saved subject string
 * into buffer.
 *
 * Both offsets are unsigned and therefore can never be negative, so no
 * request is rejected here and the result is always buffer.
 *
 * @param startOffset	Offset of the first character to copy.
 * @param endOffset		Offset one past the last character to copy.
 * @param buffer		Destination buffer.
 * @param max			Maximum number of characters to copy.
 * @param outlen		Optional; receives the number of characters copied.
 * @return				buffer.
 */
const char *RegEx::GetSubstring(size_t startOffset, size_t endOffset, char buffer[], size_t max, size_t *outlen)
{
	const char *result = getSubstring(subject, startOffset, endOffset, buffer, max, outlen);

	return result;
}
/**
 * Fills mSubsNameTable with the index and name of every named group of the
 * compiled pattern.
 *
 * Each entry of the PCRE name table starts with the group number stored in
 * two bytes, most significant byte first, followed by the null-terminated
 * group name.
 *
 * @param numSubpatterns	Total number of sub-patterns (currently unused).
 */
void RegEx::MakeSubpatternsTable(int numSubpatterns)
{
	int nameCount = 0;
	int rc = pcre_fullinfo(re, NULL, PCRE_INFO_NAMECOUNT, &nameCount);

	if (rc < 0)
	{
		return;
	}

	if (nameCount > 0)
	{
		const char *nameTable;
		int nameSize = 0;

		int rc1 = pcre_fullinfo(re, NULL, PCRE_INFO_NAMETABLE, &nameTable);
		int rc2 = pcre_fullinfo(re, NULL, PCRE_INFO_NAMEENTRYSIZE, &nameSize);

		rc = rc2 ? rc2 : rc1;

		if (rc < 0)
		{
			mSubsNameTable.clear();
			return;
		}

		NamedGroup data;

		for (int i = 0; i < nameCount; ++i)
		{
			/**
			 * Big-endian 16-bit group number: high byte * 256 + low byte.
			 * (Multiplying by 0xff instead gives wrong indexes for
			 * group numbers of 256 and above.)
			 */
			data.index = ((unsigned char)nameTable[0] << 8) | (unsigned char)nameTable[1];
			data.name = nameTable + 2;
			mSubsNameTable.append(ke::Move(data));

			nameTable += nameSize;
		}
	}
}
/**
 * Replaces the matches of the compiled pattern found in text, in place.
 *
 * All matches are collected first with MatchAll(). For every match the
 * replacement string is expanded (see the list of format specifiers below)
 * into a temporary buffer, which is then either appended to the output
 * (REGEX_FORMAT_NOCOPY: unmatched text is dropped) or substituted for the
 * matched text.
 *
 * The expanded replacement is limited to textMaxLen characters; every write
 * into the temporary buffer is bounded by the room that is left.
 * NOTE(review): text is assumed to hold textMaxLen characters plus the
 * terminator, like the temporary buffers allocated here - confirm with callers.
 *
 * @param text			Subject string, overwritten with the result.
 * @param textMaxLen	Maximum number of characters text can hold.
 * @param replace		Replacement string, may contain format specifiers.
 * @param replaceLen	Length of replace.
 * @param flags			REGEX_FORMAT_* bits (NOCOPY and FIRSTONLY are used here).
 * @return				Number of replacements made, or -1 if MatchAll() failed.
 */
int RegEx::Replace(char *text, size_t textMaxLen, const char *replace, size_t replaceLen, int flags)
{
	char *output = text;

	/**
	 * Retrieve all matches and store them in
	 * mSubStrings list.
	 */
	if (MatchAll(output) == -1)
	{
		return -1;
	}

	size_t subjectLen = strlen(subject);
	size_t total = 0;
	size_t baseIndex = 0;
	size_t diffLength = 0;

	char *toReplace = new char[textMaxLen + 1];
	char *toSearch = NULL;

	/**
	 * All characters which are not matched are not copied when replacing matches.
	 * Then the original text (output buffer) should be considered as empty.
	 */
	if (flags & REGEX_FORMAT_NOCOPY)
	{
		*output = '\0';
	}
	else
	{
		/**
		 * This is used only when we do replace matches.
		 */
		toSearch = new char[textMaxLen + 1];
	}

	/**
	 * Loop over all matches found.
	 */
	for (size_t i = 0; i < mMatchesSubs.length(); ++i)
	{
		char *ptr = toReplace;

		size_t browsed = 0;
		size_t searchLen = 0;
		size_t length = 0;

		/**
		 * Build the replace string as it can contain backreferences
		 * and this needs to be parsed.
		 * browsed is kept below textMaxLen inside the loop so that the final
		 * terminator always fits into the textMaxLen + 1 bytes of toReplace.
		 */
		for (const char *s = replace, *end = s + replaceLen; s < end && browsed < textMaxLen; ++s, ++browsed)
		{
			unsigned int c = *s;

			/**
			 * Supported format specifiers:
			 *
			 * $number  : Substitutes the substring matched by group number.
			 *            n must be an integer value designating a valid backreference, greater than 0, and of two digits at most.
			 * ${name}  : Substitutes the substring matched by the named group name (a maximum of 32 characters).
			 * $&       : Substitutes a copy of the whole match.
			 * $`       : Substitutes all the text of the input string before the match.
			 * $'       : Substitutes all the text of the input string after the match.
			 * $+       : Substitutes the last group that was captured.
			 * $_       : Substitutes the entire input string.
			 * $$       : Substitutes a literal "$".
			 */
			if (c == '$' || c == '\\')
			{
				/**
				 * Number of characters which can still be written at
				 * ptr + browsed. browsed is not modified before the
				 * places below where room is used.
				 */
				const size_t room = textMaxLen - browsed;

				switch (*++s)
				{
					case '\0':
					{
						/**
						 * End of string.
						 * Copy one character.
						 */
						*(ptr + browsed) = c;
						break;
					}
					case '&':
					{
						/**
						 * Concatenate retrieved full match sub-string.
						 * length - 1 to overwrite EOS.
						 */
						length = 0;
						GetSubstring(baseIndex, ptr + browsed, room, &length);
						browsed += length - 1;
						break;
					}
					case '`':
					{
						/**
						 * Concatenate part of original text up to
						 * first sub-string position.
						 */
						length = mSubStrings.at(baseIndex).start;

						if (length > room)
						{
							length = room;
						}

						memcpy(ptr + browsed, subject, length);
						browsed += length - 1;
						break;
					}
					case '\'':
					{
						/**
						 * Concatenate part of original text from
						 * last sub-string end position to EOS.
						 */
						const size_t tailStart = mSubStrings.at(baseIndex).end;

						length = subjectLen - tailStart;

						if (length > room)
						{
							length = room;
						}

						memcpy(ptr + browsed, subject + tailStart, length);
						browsed += length - 1;
						break;
					}
					case '+':
					{
						/**
						 * Copy the last group that was captured.
						 */
						length = 0;
						GetSubstring(baseIndex + mMatchesSubs.at(i) - 1, ptr + browsed, room, &length);
						browsed += length - 1;
						break;
					}
					case '_':
					{
						/**
						 * Copy the entire input string.
						 */
						length = subjectLen;

						if (length > room)
						{
							length = room;
						}

						memcpy(ptr + browsed, subject, length);
						browsed += length - 1;
						break;
					}
					case '$':
					case '\\':
					{
						/**
						 * Copy the single character $ or \.
						 */
						*(ptr + browsed) = c;
						break;
					}
					case '0': case '1': case '2': case '3': case '4':
					case '5': case '6': case '7': case '8': case '9':
					case '{':
					{
						/**
						 * Checking backreference.
						 * Which can be either $n, ${n} or ${name}.
						 */
						size_t backref = -1;
						const char *walk = s;
						bool inBrace = false;
						bool nameCheck = false;

						/**
						 * ${nn}.
						 *  ^
						 */
						if (*walk == '{')
						{
							inBrace = true;
							++walk;
						}

						/**
						 * Valid number.
						 * $nn or ${nn}
						 *  ^       ^
						 */
						if (*walk >= '0' && *walk <= '9')
						{
							backref = *walk - '0';
							++walk;
						}
						else if (inBrace)
						{
							nameCheck = true;

							/**
							 * Not a valid number.
							 * Checking as string.
							 * ${name}
							 *   ^
							 */
							if (*walk)
							{
								const char *pch = strchr(walk, '}');

								if (pch != NULL)
								{
									/**
									 * A named group maximum character is 32 (PCRE),
									 * plus one byte for the terminator.
									 * Longer names are truncated so the copy can't
									 * run past the buffer; such a name can't match
									 * any group anyway.
									 */
									char name[33];
									size_t nameSize = pch - walk + 1;

									if (nameSize > sizeof(name))
									{
										nameSize = sizeof(name);
									}

									strncopy(name, walk, nameSize);

									/**
									 * PCRE_INFO_OPTIONS writes an unsigned long.
									 */
									unsigned long options = 0;
									int num = 0;
									pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &options);

									/**
									 * If PCRE_DUPNAMES is set, the pcre_copy_named_substring function should be used
									 * as pcre_get_stringnumber output order is not defined.
									 */
									if (options & PCRE_DUPNAMES)
									{
										memset(ovector, 0, sizeof(ovector[0]) * REGEX_MAX_SUBPATTERNS);

										/**
										 * pcre_copy_named_substring needs a vector containing sub-patterns ranges
										 * for a given match.
										 */
										for (size_t j = 0; j < mMatchesSubs.at(i); ++j)
										{
											ovector[2 * j] = mSubStrings.at(baseIndex + j).start;
											ovector[2 * j + 1] = mSubStrings.at(baseIndex + j).end;
										}

										num = pcre_copy_named_substring(re, subject, ovector, mMatchesSubs.at(i), name, ptr + browsed, (int)room);

										/**
										 * Only a non-negative result is a length.
										 * Any error code must not be added to browsed.
										 */
										if (num >= 0)
										{
											browsed += num - 1;
											s = pch;
											break;
										}

										++pch;
									}
									else
									{
										/**
										 * Retrieve sub-pattern index from a given name.
										 */
										num = pcre_get_stringnumber(re, name);

										if (num != PCRE_ERROR_NOSUBSTRING)
										{
											backref = num;
											walk = ++pch;
										}
									}

									if (num < 0 || num >= (int)mMatchesSubs.at(i))
									{
										/**
										 * If a sub-string for a given match is not found, or if > to
										 * number of sub-patterns we still need to check if this
										 * group name is a valid one because if so we want to escape it.
										 * Looking at the name table.
										 */
										bool found = false;

										for (size_t k = 0; k < mSubsNameTable.length(); ++k)
										{
											if (!mSubsNameTable.at(k).name.compare(name))
											{
												--browsed;
												s = --pch;
												found = true;
												break;
											}
										}

										if (found)
										{
											// Resume the scan of the replacement string.
											continue;
										}
									}
								}
							}
						}

						if (!nameCheck)
						{
							/**
							 * Valid second number.
							 * $nn or ${nn}
							 *   ^       ^
							 */
							if (*walk && *walk >= '0' && *walk <= '9')
							{
								backref = backref * 10 + *walk - '0';
								++walk;
							}

							if (inBrace)
							{
								/**
								 * Invalid specifier
								 * Either hit EOS or missing }.
								 * ${n  or ${nn  or ${nx or ${nnx
								 *    ^        ^      ^        ^
								 */
								if (*walk == '\0' || *walk != '}')
								{
									backref = -1;
								}
								else
								{
									++walk;
								}
							}
						}

						length = walk - s;
						s = --walk;

						/**
						 * We can't provide a capture number >= to total that pcre_exec has found.
						 * 0 is implicitly accepted, same behavior as $&.
						 * backref is unsigned: the -1 sentinel converts to (int)-1 and so
						 * also takes this branch, where it fails the per-match check below.
						 */
						if ((int)backref < mNumSubpatterns)
						{
							/**
							 * Valid available index for a given match.
							 */
							if (backref < mMatchesSubs.at(i))
							{
								/**
								 * Concatenate retrieved sub-string.
								 * length - 1 to overwrite EOS.
								 */
								length = 0;
								GetSubstring(baseIndex + backref, ptr + browsed, room, &length);
								browsed += length - 1;
							}
							else
							{
								/**
								 * Valid unavailable index for a given match.
								 */
								--browsed;
							}
						}
						else
						{
							/**
							 * If we are here it means the syntax is valid but sub-pattern doesn't exist.
							 * So, copy as it is, including $.
							 */
							size_t count = length + 1;

							if (count > room)
							{
								count = room;
							}

							memcpy(ptr + browsed, s - length, count);
							browsed += count - 1;
						}

						break;
					}
					default:
					{
						/**
						 * Not a valid format modifier.
						 * So we copy characters as it is.
						 */
						*(ptr + browsed) = *s;
						break;
					}
				}
			}
			else
			{
				/**
				 * At this point, direct copy.
				 */
				*(ptr + browsed) = c;
			}
		}

		*(ptr + browsed) = '\0';

		/**
		 * Concatenate only replace string of each match,
		 * as we don't want to copy unmatched characters.
		 */
		if (flags & REGEX_FORMAT_NOCOPY)
		{
			/**
			 * We want just the first occurrence.
			 */
			if (total && (flags & REGEX_FORMAT_FIRSTONLY))
			{
				break;
			}

			++total;

			/**
			 * strncat's size is the number of characters to append,
			 * so it has to be the room left in the output buffer.
			 */
			const size_t used = strlen(output);

			if (used < textMaxLen)
			{
				strncat(output, toReplace, textMaxLen - used);
			}
		}
		else
		{
			/**
			 * Retrieves full string of a given match.
			 */
			const char *search = GetSubstring(baseIndex, toSearch, textMaxLen, &searchLen);

			/**
			 * We get something to replace, but the sub-pattern to search is empty.
			 * We insert replacement either at the start or the end of the string.
			 */
			if (*toReplace && !searchLen)
			{
				if (output != NULL && output > text)
				{
					const size_t used = strlen(text);

					if (used < textMaxLen)
					{
						strncat(output, toReplace, textMaxLen - used);
					}
				}
				else
				{
					strncat(toReplace, text, textMaxLen - strlen(toReplace));
					strncopy(text, toReplace, strlen(toReplace) + 1);
				}

				++total;
			}
			else if ((output = UTIL_ReplaceEx(text + mSubStrings.at(baseIndex).start + diffLength, textMaxLen, search, searchLen, toReplace, browsed, false)) != NULL)
			{
				/**
				 * Then we simply do a replace.
				 * Probably not the most efficient, but this should be at least safe.
				 * To avoid issue where the function could find a string which is not at the expected index,
				 * we force the input string to start from index of the full match.
				 */
				++total;
			}

			if (total && (flags & REGEX_FORMAT_FIRSTONLY))
			{
				break;
			}
		}

		/**
		 * mSubStrings is a flat list containing all sub-patterns of all matches.
		 * A number of sub-patterns can vary per match. So we calculate the position in the list,
		 * from where the first sub-pattern result of current match starts.
		 */
		baseIndex += mMatchesSubs.at(i);
		diffLength += browsed - searchLen;
	}

	delete[] toReplace;

	if (toSearch != NULL)
	{
		delete[] toSearch;
	}

	/**
	 * Return the number of successful replacements.
	 */
	return (int)total;
}