Commit b97a1fc9 authored by ahoms
Browse files

* added RegEx class with Python-like regex extension for named groups



git-svn-id: https://scm.blissgarden.org/svn/lima/trunk@68 45c4679d-1946-429d-baad-37912b19538b
parent 33103486
......@@ -5,6 +5,7 @@
#include <string>
#include <vector>
#include <map>
#include <sys/types.h>
#include <regex.h>
......@@ -15,8 +16,14 @@ class SimpleRegEx
{
public:
typedef struct SingleMatch {
std::string::const_iterator start;
std::string::const_iterator end;
typedef std::string::const_iterator StrIt;
StrIt start;
StrIt end;
SingleMatch();
SingleMatch(StrIt it, const regmatch_t& rm);
operator bool() const;
} SingleMatchType;
typedef std::vector<SingleMatchType> FullMatchType;
......@@ -27,34 +34,96 @@ class SimpleRegEx
SimpleRegEx(const SimpleRegEx& regex);
~SimpleRegEx();
SimpleRegEx& operator =(const std::string& regex_str);
SimpleRegEx& operator +=(const std::string& regex_str);
SimpleRegEx& operator =(const SimpleRegEx& regex);
SimpleRegEx& operator +=(const SimpleRegEx& regex);
const std::string& getRegExStr() const;
bool singleSearch(const std::string& str, FullMatchType& match,
int nb_groups = 0, int match_idx = 0);
int nb_groups = 0, int match_idx = 0) const;
void multiSearch(const std::string& str, MatchListType& match_list,
int nb_groups = 0, int max_nb_match = 0);
int nb_groups = 0, int max_nb_match = 0) const;
bool match(const std::string& str, FullMatchType& match,
int nb_groups = 0);
int nb_groups = 0) const;
int getNbGroups() const;
private:
void set(const std::string& regex_str);
void free();
void throwError(int ret, std::string file, std::string func, int line);
static int findNbGroups(const std::string& regex_str);
void throwError(int ret, std::string file, std::string func,
int line) const;
std::string m_str;
regex_t m_regex;
int m_nb_groups;
};
// Concatenates two regular expressions: the result is a copy of re1 with
// the pattern of re2 appended through SimpleRegEx::operator +=.
inline SimpleRegEx operator +(const SimpleRegEx& re1, const SimpleRegEx& re2)
{
	SimpleRegEx joined(re1);
	joined += re2;
	return joined;
}
SimpleRegEx operator +(const SimpleRegEx& re1, const SimpleRegEx& re2);
// Regular expression with a Python-like extension for named groups
// ("(?P<name>...)"). The named-group syntax is removed from the pattern in
// set() and the remaining expression is handled by a SimpleRegEx; the
// association between each name and its group number is kept in m_name_map.
class RegEx {
public:
// Match types shared with SimpleRegEx.
typedef SimpleRegEx::SingleMatchType SingleMatchType;
typedef SimpleRegEx::FullMatchType FullMatchType;
typedef SimpleRegEx::MatchListType MatchListType;
// Result of a named search: group name -> match of that group.
typedef std::map<std::string, SingleMatchType> FullNameMatchType;
// One FullNameMatchType per successive match found by multiSearchName().
typedef std::vector<FullNameMatchType> NameMatchListType;
RegEx();
RegEx(const std::string& regex_str);
RegEx(const RegEx& regex);
~RegEx();
// Assignment replaces the pattern; += appends to the current pattern
// string and re-parses the concatenation.
RegEx& operator =(const std::string& regex_str);
RegEx& operator +=(const std::string& regex_str);
RegEx& operator =(const RegEx& regex);
RegEx& operator +=(const RegEx& regex);
// Returns the pattern as given by the caller (named-group syntax included).
const std::string& getRegExStr() const;
// The three positional searches below forward directly to the wrapped
// SimpleRegEx with the same arguments.
// NOTE(review): nb_groups = 0 presumably means "all groups", as it does in
// SimpleRegEx::multiSearch - confirm for singleSearch/match.
bool singleSearch(const std::string& str,
FullMatchType& match,
int nb_groups = 0, int match_idx = 0) const;
// max_nb_match = 0 places no limit on the number of matches collected.
void multiSearch(const std::string& str,
MatchListType& match_list,
int nb_groups = 0, int max_nb_match = 0) const;
bool match(const std::string& str,
FullMatchType& match,
int nb_groups = 0) const;
// Named variants: run the corresponding positional search and, on
// success, fill the output map with one entry per named group. The
// output container is always cleared first.
bool singleSearchName(const std::string& str,
FullNameMatchType& name_match,
int match_idx = 0) const;
void multiSearchName(const std::string& str,
NameMatchListType& name_match_list,
int max_nb_match = 0) const;
bool matchName(const std::string& str,
FullNameMatchType& name_match) const;
// Number of groups reported by the wrapped SimpleRegEx.
int getNbGroups() const;
// Number of distinct group names recorded in m_name_map.
int getNbNameGroups() const;
private:
// Group name -> 1-based group number in the simplified expression.
typedef std::map<std::string, int> NameMapType;
// Parses regex_str, strips the "?P<name>" extensions and compiles the rest.
void set(const std::string& regex_str);
// Clears the name map, the wrapped expression and the stored pattern.
void free();
// Copies the match of every named group from match into name_match.
void convertNameMatch(const FullMatchType& match,
FullNameMatchType& name_match) const;
// Pattern exactly as supplied by the caller.
std::string m_str;
// Compiled expression built from the pattern without the name extensions.
SimpleRegEx m_regex;
NameMapType m_name_map;
};
RegEx operator +(const RegEx& re1, const RegEx& re2);
} // namespace lima
......
......@@ -11,6 +11,26 @@ using namespace std;
__LINE__); \
}
// Builds an empty ("no match") entry.
//
// Both iterators are value-initialized so that they are guaranteed to
// compare equal and operator bool() reports false. Copying a merely
// default-initialized iterator (as "start = end" would do) reads an
// indeterminate value on implementations where the string iterator is a
// plain pointer.
SimpleRegEx::SingleMatch::SingleMatch()
	: start(), end()
{
}
// Builds a match from the offsets reported by regexec().
//
// it - iterator the offsets in rm are relative to
// rm - start/end offsets of one (sub)expression; an offset of -1 on either
//      side marks a group that did not take part in the match
//
// An unmatched group yields the empty range [it, it).
SimpleRegEx::SingleMatch::SingleMatch(StrIt it, const regmatch_t& rm)
{
	const bool unmatched = (rm.rm_so == -1) || (rm.rm_eo == -1);
	if (unmatched) {
		start = it;
		end = it;
		return;
	}

	start = it + rm.rm_so;
	end = it + rm.rm_eo;
}
// True when the match covers at least one character. An empty range,
// whether it comes from an unmatched group or from a zero-length match,
// evaluates to false.
SimpleRegEx::SingleMatch::operator bool() const
{
	const bool empty_range = (start == end);
	return !empty_range;
}
SimpleRegEx::SimpleRegEx()
{
set("");
......@@ -33,6 +53,18 @@ SimpleRegEx::~SimpleRegEx()
free();
}
// Replaces the current pattern with regex_str (handled by set()) and
// returns this object to allow chaining.
SimpleRegEx& SimpleRegEx::operator =(const string& regex_str)
{
	SimpleRegEx& self = *this;
	self.set(regex_str);
	return self;
}
// Appends regex_str to the current pattern string and installs the
// concatenation through set(). Returns this object.
SimpleRegEx& SimpleRegEx::operator +=(const string& regex_str)
{
	const string combined = m_str + regex_str;
	set(combined);
	return *this;
}
SimpleRegEx& SimpleRegEx::operator =(const SimpleRegEx& regex)
{
set(regex.m_str);
......@@ -41,8 +73,7 @@ SimpleRegEx& SimpleRegEx::operator =(const SimpleRegEx& regex)
// Appends the pattern of regex to the current pattern string and installs
// the concatenation through set(). Returns this object.
//
// set() must be called exactly once: a second call would see the already
// updated m_str and append regex.m_str a second time.
SimpleRegEx& SimpleRegEx::operator +=(const SimpleRegEx& regex)
{
	set(m_str + regex.m_str);
	return *this;
}
......@@ -55,7 +86,30 @@ void SimpleRegEx::set(const string& regex_str)
if (!regex_str.empty())
CHECK_CALL(regcomp(&m_regex, regex_str.c_str(), REG_EXTENDED));
m_str = regex_str;
m_nb_groups = findNbGroups(regex_str);
}
int SimpleRegEx::findNbGroups(const string& regex_str)
{
int nb_groups = 0;
string::const_iterator it, end = regex_str.end();
bool in_escape = false;
for (it = regex_str.begin(); it != end; ++it) {
switch (*it) {
case '\\':
in_escape = !in_escape;
break;
case '(':
if (!in_escape)
nb_groups++;
default:
in_escape = false;
}
}
return nb_groups;
}
void SimpleRegEx::free()
......@@ -73,8 +127,13 @@ const string& SimpleRegEx::getRegExStr() const
return m_str;
}
int SimpleRegEx::getNbGroups() const
{
return m_nb_groups;
}
bool SimpleRegEx::singleSearch(const string& str, FullMatchType& match,
int nb_groups, int match_idx)
int nb_groups, int match_idx) const
{
if (match_idx < 0)
throw LIMA_COM_EXC(InvalidValue, "Invalid match index");
......@@ -89,7 +148,7 @@ bool SimpleRegEx::singleSearch(const string& str, FullMatchType& match,
}
void SimpleRegEx::multiSearch(const string& str, MatchListType& match_list,
int nb_groups, int max_nb_match)
int nb_groups, int max_nb_match) const
{
if (m_str.empty())
throw LIMA_COM_EXC(InvalidValue, "Regular expression not set");
......@@ -97,48 +156,40 @@ void SimpleRegEx::multiSearch(const string& str, MatchListType& match_list,
match_list.clear();
typedef string::const_iterator StrIt;
StrIt sbegin = str.begin();
StrIt send = str.end();
StrIt sbeg = str.begin();
StrIt send = str.end();
nb_groups = (nb_groups > 0) ? nb_groups : (m_nb_groups + 1);
if (nb_groups == 0)
nb_groups = 255;
regmatch_t reg_match[nb_groups];
regmatch_t *mend = reg_match + nb_groups;
StrIt it = sbegin;
StrIt it = sbeg;
for (int i = 0; it != send; i++) {
if ((max_nb_match > 0) && (i == max_nb_match))
break;
string aux(it, send);
int flags = (it != sbegin) ? REG_NOTBOL : 0;
int flags = (it != sbeg) ? REG_NOTBOL : 0;
int ret = regexec(&m_regex, aux.c_str(), nb_groups, reg_match,
flags);
if (ret == REG_NOMATCH)
break;
CHECK_CALL(ret);
StrIt match_end = send;
FullMatchType full_match;
for (regmatch_t *m = reg_match; m != mend; ++m) {
if (m->rm_so == -1)
break;
SingleMatchType match;
match.start = it + m->rm_so;
match.end = it + m->rm_eo;
SingleMatchType match(it, *m);
full_match.push_back(match);
match_end = match.end;
}
match_list.push_back(full_match);
it = match_end;
it += reg_match[0].rm_eo;
}
}
bool SimpleRegEx::match(const string& str, FullMatchType& match,
int nb_groups)
int nb_groups) const
{
if (!singleSearch(str, match, nb_groups))
return false;
......@@ -146,7 +197,7 @@ bool SimpleRegEx::match(const string& str, FullMatchType& match,
return (match[0].start == str.begin());
}
void SimpleRegEx::throwError(int ret, string file, string func, int line)
void SimpleRegEx::throwError(int ret, string file, string func, int line) const
{
size_t len = regerror(ret, &m_regex, NULL, 0);
string regerr(len, '\0');
......@@ -156,5 +207,213 @@ void SimpleRegEx::throwError(int ret, string file, string func, int line)
throw Exception(Common, Error, err_desc, file, func, line);
}
// Concatenates two regular expressions: the result is a copy of re1 with
// the pattern of re2 appended through SimpleRegEx::operator +=.
SimpleRegEx lima::operator +(const SimpleRegEx& re1, const SimpleRegEx& re2)
{
	SimpleRegEx joined(re1);
	joined += re2;
	return joined;
}
// Creates an expression with an empty pattern.
RegEx::RegEx()
{
	const string empty_pattern;
	set(empty_pattern);
}
// Creates an expression from regex_str, which may contain named groups
// written as "(?P<name>...)".
RegEx::RegEx(const string& regex_str)
{
	this->set(regex_str);
}
// Copy constructor: re-parses the original pattern string of regex rather
// than copying its members.
RegEx::RegEx(const RegEx& regex)
{
	const string& source_pattern = regex.m_str;
	set(source_pattern);
}
// Resets the object through free() before the members are destroyed.
RegEx::~RegEx()
{
	this->free();
}
// Replaces the current pattern with regex_str and returns this object.
RegEx& RegEx::operator =(const string& regex_str)
{
	RegEx& self = *this;
	self.set(regex_str);
	return self;
}
// Appends regex_str to the current pattern string and re-parses the
// concatenation. Returns this object.
RegEx& RegEx::operator +=(const string& regex_str)
{
	const string combined = m_str + regex_str;
	set(combined);
	return *this;
}
// Copy assignment: re-parses the original pattern string of regex and
// returns this object.
RegEx& RegEx::operator =(const RegEx& regex)
{
	const string& source_pattern = regex.m_str;
	set(source_pattern);
	return *this;
}
// Appends the original pattern string of regex to the current one and
// re-parses the concatenation. Returns this object.
RegEx& RegEx::operator +=(const RegEx& regex)
{
	const string combined = m_str + regex.m_str;
	set(combined);
	return *this;
}
void RegEx::free()
{
m_name_map.clear();
m_regex = "";
m_str.clear();
}
void RegEx::set(const string& regex_str)
{
if (regex_str == m_str)
return;
free();
string re = "([^(])?(\\()(\\?P<([A-Za-z][A-Za-z0-9_]*)>)?[^\\(]*";
SimpleRegEx grp_start_re(re);
typedef SimpleRegEx::SingleMatchType SingleMatchType;
typedef SimpleRegEx::FullMatchType FullMatchType;
typedef SimpleRegEx::MatchListType MatchListType;
MatchListType grp_list;
grp_start_re.multiSearch(regex_str, grp_list);
string::const_iterator sit = regex_str.begin();
int grp_nb = 0;
string simple_regex_str;
MatchListType::const_iterator mit, mend = grp_list.end();
for (mit = grp_list.begin(); mit != mend; ++mit) {
const FullMatchType& fm = *mit;
const SingleMatchType& grp_start = fm[0];
const SingleMatchType& pre_grp_chr = fm[1];
const SingleMatchType& grp_open = fm[2];
const SingleMatchType& grp_ext = fm[3];
const SingleMatchType& grp_name = fm[4];
simple_regex_str += string(sit, grp_open.end);
sit = grp_open.end;
bool is_grp = (!pre_grp_chr || (*pre_grp_chr.start != '\\'));
if (is_grp)
grp_nb++;
if (is_grp && grp_ext) {
string name(grp_name.start, grp_name.end);
m_name_map[name] = grp_nb;
sit = grp_ext.end;
}
simple_regex_str += string(sit, grp_start.end);
sit = grp_start.end;
}
simple_regex_str += string(sit, regex_str.end());
m_str = regex_str;
m_regex = simple_regex_str;
if (m_regex.getNbGroups() != grp_nb)
throw LIMA_COM_EXC(Error, "RegEx nb of groups mismatch");
}
// Returns the pattern exactly as it was supplied, named-group syntax
// included.
const string& RegEx::getRegExStr() const
{
	const string& pattern = m_str;
	return pattern;
}
// Returns the group count reported by the wrapped SimpleRegEx.
int RegEx::getNbGroups() const
{
	const int nb_groups = m_regex.getNbGroups();
	return nb_groups;
}
// Returns the number of distinct group names recorded for the pattern.
int RegEx::getNbNameGroups() const
{
	return static_cast<int>(m_name_map.size());
}
// Forwards to SimpleRegEx::singleSearch on the wrapped expression with the
// same arguments and returns its result.
bool RegEx::singleSearch(const string& str, FullMatchType& match,
			 int nb_groups, int match_idx) const
{
	const bool found = m_regex.singleSearch(str, match, nb_groups,
						match_idx);
	return found;
}
bool RegEx::singleSearchName(const string& str,
FullNameMatchType& name_match,
int match_idx) const
{
name_match.clear();
FullMatchType full_match;
if (!singleSearch(str, full_match, 0, match_idx))
return false;
convertNameMatch(full_match, name_match);
return true;
}
void RegEx::convertNameMatch(const FullMatchType& full_match,
FullNameMatchType& name_match) const
{
NameMapType::const_iterator it, end = m_name_map.end();
for (it = m_name_map.begin(); it != end; ++it) {
const string& name = it->first;
int group_nb = it->second;
const SingleMatchType& match = full_match[group_nb];
name_match[name] = match;
}
}
void RegEx::multiSearch(const string& str, MatchListType& match_list,
int nb_groups, int max_nb_match) const
{
m_regex.multiSearch(str, match_list, nb_groups, max_nb_match);
}
void RegEx::multiSearchName(const string& str,
NameMatchListType& name_match_list,
int max_nb_match) const
{
name_match_list.clear();
MatchListType match_list;
multiSearch(str, match_list, 0, max_nb_match);
MatchListType::const_iterator it, end = match_list.end();
for (it = match_list.begin(); it != end; ++it) {
FullNameMatchType name_match;
convertNameMatch(*it, name_match);
name_match_list.push_back(name_match);
}
}
// Forwards to SimpleRegEx::match on the wrapped expression with the same
// arguments and returns its result.
bool RegEx::match(const string& str, FullMatchType& match,
		  int nb_groups) const
{
	const bool matched = m_regex.match(str, match, nb_groups);
	return matched;
}
bool RegEx::matchName(const string& str,
FullNameMatchType& name_match) const
{
name_match.clear();
FullMatchType full_match;
if (!match(str, full_match, 0))
return false;
convertNameMatch(full_match, name_match);
return true;
}
// Concatenates two regular expressions: the result is a copy of re1 with
// the pattern of re2 appended through RegEx::operator +=.
RegEx lima::operator +(const RegEx& re1, const RegEx& re2)
{
	RegEx joined(re1);
	joined += re2;
	return joined;
}
......@@ -5,20 +5,18 @@
using namespace lima;
using namespace std;
void test_simple_regex(const string& re_str, const string& s)
{
SimpleRegEx re(re_str);
typedef RegEx::SingleMatchType SingleMatchType;
typedef RegEx::FullMatchType FullMatchType;
typedef RegEx::MatchListType MatchListType;
typedef RegEx::FullNameMatchType FullNameMatchType;
typedef RegEx::NameMatchListType NameMatchListType;
cout << "re=\"" << re.getRegExStr() << "\"" << endl;
void print_match_list(const string& re, int nb_groups, const string& s,
const MatchListType& match_list)
{
cout << "re(" << nb_groups << ")=\"" << re << "\"" << endl;
cout << "s=\"" << s << "\"" << endl;
typedef SimpleRegEx::SingleMatchType SingleMatchType;
typedef SimpleRegEx::FullMatchType FullMatchType;
typedef SimpleRegEx::MatchListType MatchListType;
MatchListType match_list;
re.multiSearch(s, match_list);
MatchListType::const_iterator iti = match_list.begin();
string::const_iterator b = s.begin();
for (int i = 0; iti != match_list.end(); ++i, ++iti) {
......@@ -26,8 +24,41 @@ void test_simple_regex(const string& re_str, const string& s)
FullMatchType::const_iterator itj = fmatch.begin();
for (int j = 0; itj != fmatch.end(); ++j, ++itj) {
const SingleMatchType& smatch = *itj;
cout << i << "-" << j << ": "
<< smatch.start - b << "-" << smatch.end - b
cout << i << "-" << j << ": ";
if (!smatch) {
cout << "No match" << endl;
continue;
}
cout << smatch.start - b << "-" << smatch.end - b
<< ": " << string(smatch.start, smatch.end)
<< endl;
}
}
cout << endl;
}
void print_name_match_list(const string& re, int nb_name_groups,
const string& s,
const NameMatchListType& name_match_list)
{
cout << "re(" << nb_name_groups << ")=\"" << re << "\"" << endl;
cout << "s=\"" << s << "\"" << endl;
NameMatchListType::const_iterator iti = name_match_list.begin();
string::const_iterator b = s.begin();
for (int i = 0; iti != name_match_list.end(); ++i, ++iti) {
const FullNameMatchType& fmatch = *iti;
FullNameMatchType::const_iterator itj = fmatch.begin();
for (int j = 0; itj != fmatch.end(); ++j, ++itj) {
const string& name = itj->first;
const SingleMatchType& smatch = itj->second;
cout << i << "-\"" << name << "\": ";
if (!smatch) {
cout << "No match" << endl;
continue;
}
cout << smatch.start - b << "-" << smatch.end - b
<< ": " << string(smatch.start, smatch.end)
<< endl;
}
......@@ -35,12 +66,42 @@ void test_simple_regex(const string& re_str, const string& s)
cout << endl;
}
void test_simple_regex(const string& re_str, const string& s)
{
SimpleRegEx re(re_str);
MatchListType match_list;
re.multiSearch(s, match_list);
print_match_list(re_str, re.getNbGroups(), s, match_list);
}
void test_regex(const string& re_str, const string& s)
{
RegEx re(re_str);
MatchListType match_list;
re.multiSearch(s, match_list);
print_match_list(re_str, re.getNbGroups(), s, match_list);
NameMatchListType name_match_list;
re.multiSearchName(s, name_match_list);
print_name_match_list(re_str, re.getNbNameGroups(), s,
name_match_list);