Implements a simple regex matcher (to be used by death tests on Windows).

This commit is contained in:
zhanyong.wan
2009-01-29 01:28:52 +00:00
parent a32fc79c9a
commit c946ae6019
6 changed files with 854 additions and 27 deletions

View File

@@ -39,6 +39,10 @@
#include <regex.h>
#endif // GTEST_HAS_DEATH_TEST
#if GTEST_USES_SIMPLE_RE
#include <string.h>
#endif
#ifdef _WIN32_WCE
#include <windows.h> // For TerminateProcess()
#endif // _WIN32_WCE
@@ -47,11 +51,19 @@
#include <gtest/gtest-message.h>
#include <gtest/internal/gtest-string.h>
// Indicates that this translation unit is part of Google Test's
// implementation. It must come before gtest-internal-inl.h is
// included, or there will be a compiler error. This trick is to
// prevent a user from accidentally including gtest-internal-inl.h in
// his code.
#define GTEST_IMPLEMENTATION
#include "src/gtest-internal-inl.h"
#undef GTEST_IMPLEMENTATION
namespace testing {
namespace internal {
#ifdef GTEST_HAS_DEATH_TEST
#if GTEST_USES_POSIX_RE
// Implements RE. Currently only needed for death tests.
@@ -101,7 +113,262 @@ void RE::Init(const char* regex) {
delete[] full_pattern;
}
#endif // GTEST_HAS_DEATH_TEST
#elif GTEST_USES_SIMPLE_RE
// Returns true iff ch appears anywhere in str (excluding the
// terminating '\0' character).
bool IsInSet(char ch, const char* str) {
return ch != '\0' && strchr(str, ch) != NULL;
}
// Returns true iff ch belongs to the given classification. Unlike
// similar functions in <ctype.h>, these aren't affected by the
// current locale.
bool IsDigit(char ch) { return '0' <= ch && ch <= '9'; }
bool IsPunct(char ch) {
return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~");
}
bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
bool IsWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
bool IsWordChar(char ch) {
return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
('0' <= ch && ch <= '9') || ch == '_';
}
// Returns true iff "\\c" is a supported escape sequence.
bool IsValidEscape(char c) {
return (IsPunct(c) || IsInSet(c, "dDfnrsStvwW"));
}
// Returns true iff the given atom (specified by escaped and pattern)
// matches ch. The result is undefined if the atom is invalid.
bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
if (escaped) { // "\\p" where p is pattern_char.
switch (pattern_char) {
case 'd': return IsDigit(ch);
case 'D': return !IsDigit(ch);
case 'f': return ch == '\f';
case 'n': return ch == '\n';
case 'r': return ch == '\r';
case 's': return IsWhiteSpace(ch);
case 'S': return !IsWhiteSpace(ch);
case 't': return ch == '\t';
case 'v': return ch == '\v';
case 'w': return IsWordChar(ch);
case 'W': return !IsWordChar(ch);
}
return IsPunct(pattern_char) && pattern_char == ch;
}
return (pattern_char == '.' && ch != '\n') || pattern_char == ch;
}
// Helper function used by ValidateRegex() to format error messages.
String FormatRegexSyntaxError(const char* regex, int index) {
return (Message() << "Syntax error at index " << index
<< " in simple regular expression \"" << regex << "\": ").GetString();
}
// Generates non-fatal failures and returns false if regex is invalid;
// otherwise returns true.
bool ValidateRegex(const char* regex) {
if (regex == NULL) {
// TODO(wan@google.com): fix the source file location in the
// assertion failures to match where the regex is used in user
// code.
ADD_FAILURE() << "NULL is not a valid simple regular expression.";
return false;
}
bool is_valid = true;
// True iff ?, *, or + can follow the previous atom.
bool prev_repeatable = false;
for (int i = 0; regex[i]; i++) {
if (regex[i] == '\\') { // An escape sequence
i++;
if (regex[i] == '\0') {
ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
<< "'\\' cannot appear at the end.";
return false;
}
if (!IsValidEscape(regex[i])) {
ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1)
<< "invalid escape sequence \"\\" << regex[i] << "\".";
is_valid = false;
}
prev_repeatable = true;
} else { // Not an escape sequence.
const char ch = regex[i];
if (ch == '^' && i > 0) {
ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
<< "'^' can only appear at the beginning.";
is_valid = false;
} else if (ch == '$' && regex[i + 1] != '\0') {
ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
<< "'$' can only appear at the end.";
is_valid = false;
} else if (IsInSet(ch, "()[]{}|")) {
ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
<< "'" << ch << "' is unsupported.";
is_valid = false;
} else if (IsRepeat(ch) && !prev_repeatable) {
ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
<< "'" << ch << "' can only follow a repeatable token.";
is_valid = false;
}
prev_repeatable = !IsInSet(ch, "^$?*+");
}
}
return is_valid;
}
// Matches a repeated regex atom followed by a valid simple regular
// expression. The regex atom is defined as c if escaped is false,
// or \c otherwise. repeat is the repetition meta character (?, *,
// or +). The behavior is undefined if str contains too many
// characters to be indexable by size_t, in which case the test will
// probably time out anyway. We are fine with this limitation as
// std::string has it too.
bool MatchRepetitionAndRegexAtHead(
bool escaped, char c, char repeat, const char* regex,
const char* str) {
const size_t min_count = (repeat == '+') ? 1 : 0;
const size_t max_count = (repeat == '?') ? 1 :
static_cast<size_t>(-1) - 1;
// We cannot call numeric_limits::max() as it conflicts with the
// max() macro on Windows.
for (size_t i = 0; i <= max_count; ++i) {
// We know that the atom matches each of the first i characters in str.
if (i >= min_count && MatchRegexAtHead(regex, str + i)) {
// We have enough matches at the head, and the tail matches too.
// Since we only care about *whether* the pattern matches str
// (as opposed to *how* it matches), there is no need to find a
// greedy match.
return true;
}
if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
return false;
}
return false;
}
// Returns true iff regex matches a prefix of str. regex must be a
// valid simple regular expression and not start with "^", or the
// result is undefined.
bool MatchRegexAtHead(const char* regex, const char* str) {
if (*regex == '\0') // An empty regex matches a prefix of anything.
return true;
// "$" only matches the end of a string. Note that regex being
// valid guarantees that there's nothing after "$" in it.
if (*regex == '$')
return *str == '\0';
// Is the first thing in regex an escape sequence?
const bool escaped = *regex == '\\';
if (escaped)
++regex;
if (IsRepeat(regex[1])) {
// MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
// here's an indirect recursion. It terminates as the regex gets
// shorter in each recursion.
return MatchRepetitionAndRegexAtHead(
escaped, regex[0], regex[1], regex + 2, str);
} else {
// regex isn't empty, isn't "$", and doesn't start with a
// repetition. We match the first atom of regex with the first
// character of str and recurse.
return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
MatchRegexAtHead(regex + 1, str + 1);
}
}
// Returns true iff regex matches any substring of str. regex must be
// a valid simple regular expression, or the result is undefined.
//
// The algorithm is recursive, but the recursion depth doesn't exceed
// the regex length, so we won't need to worry about running out of
// stack space normally. In rare cases the time complexity can be
// exponential with respect to the regex length + the string length,
// but usually it's must faster (often close to linear).
bool MatchRegexAnywhere(const char* regex, const char* str) {
if (regex == NULL || str == NULL)
return false;
if (*regex == '^')
return MatchRegexAtHead(regex + 1, str);
// A successful match can be anywhere in str.
do {
if (MatchRegexAtHead(regex, str))
return true;
} while (*str++ != '\0');
return false;
}
// Implements the RE class.
RE::~RE() {
free(const_cast<char*>(pattern_));
free(const_cast<char*>(full_pattern_));
}
// Returns true iff regular expression re matches the entire str.
bool RE::FullMatch(const char* str, const RE& re) {
return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str);
}
// Returns true iff regular expression re matches a substring of str
// (including str itself).
bool RE::PartialMatch(const char* str, const RE& re) {
return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str);
}
// Initializes an RE from its string representation.
void RE::Init(const char* regex) {
pattern_ = full_pattern_ = NULL;
if (regex != NULL) {
#ifdef GTEST_OS_WINDOWS
pattern_ = _strdup(regex);
#else
pattern_ = strdup(regex);
#endif
}
is_valid_ = ValidateRegex(regex);
if (!is_valid_) {
// No need to calculate the full pattern when the regex is invalid.
return;
}
const size_t len = strlen(regex);
// Reserves enough bytes to hold the regular expression used for a
// full match: we need space to prepend a '^', append a '$', and
// terminate the string with '\0'.
char* buffer = static_cast<char*>(malloc(len + 3));
full_pattern_ = buffer;
if (*regex != '^')
*buffer++ = '^'; // Makes sure full_pattern_ starts with '^'.
// We don't use snprintf or strncpy, as they trigger a warning when
// compiled with VC++ 8.0.
memcpy(buffer, regex, len);
buffer += len;
if (len == 0 || regex[len - 1] != '$')
*buffer++ = '$'; // Makes sure full_pattern_ ends with '$'.
*buffer = '\0';
}
#endif // GTEST_USES_POSIX_RE
// Logs a message at the given severity level.
void GTestLog(GTestLogSeverity severity, const char* file,