Private members of a regular expression 更多...

Public 成员函数
	Private (const std::string &pat)
	Creates the private part 更多...

void	compile ()
	Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching. 更多...

bool	matchAt (size_t tokenPos, const std::string &str, Match &match, size_t pos, int level) const
	Internal matching routine. 更多...

Public 属性
bool	error = false
	Flag indicating that compiling the expression failed (set to true by compile() when the pattern is malformed) 更多...

std::vector< PToken >	data
	The token stream representing the compiled regular expression. 更多...

std::string	pattern
	The pattern string as passed by the user 更多...

详细描述

Private members of a regular expression

在文件 regex.cpp 第 169 行定义.

构造及析构函数说明

◆ Private()

reg::Ex::Private::Private ( const std::string & pat )

inline

Creates the private part

在文件 regex.cpp 第 173 行定义.

                                   : pattern(pat)
     {
       data.reserve(100);
     }

引用了 data.

成员函数说明

◆ compile()

void reg::Ex::Private::compile ( )

Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching.

在文件 regex.cpp 第 196 行定义.

 {
   error = false;
   data.clear();
   if (pattern.empty()) return;
   const char *start = pattern.c_str();
   const char *ps = start;
   char c;
  
   int prevTokenPos=-1;
   int tokenPos=0;
  
   auto addToken = [&](PToken tok)
   {
     tokenPos++;
     data.emplace_back(tok);
   };
  
   auto getNextCharacter = [&]() -> PToken
   {
     char cs=*ps;
     PToken result = PToken(cs);
     if (cs=='\\') // escaped character
     {
       ps++;
       cs=*ps;
       switch (cs)
       {
         case 'n': result = PToken('\n');                      break;
         case 'r': result = PToken('\r');                      break;
         case 't': result = PToken('\t');                      break;
         case 's': result = PToken(PToken::Kind::WhiteSpace);  break;
         case 'a': result = PToken(PToken::Kind::Alpha);       break;
         case 'w': result = PToken(PToken::Kind::AlphaNum);    break;
         case 'd': result = PToken(PToken::Kind::Digit);       break;
         case '<': result = PToken(PToken::Kind::BeginOfWord); break;
         case '>': result = PToken(PToken::Kind::EndOfWord);   break;
         case 'x':
         case 'X':
           {
             uint16_t v=0;
             for (int i=0;i<2 && (cs=(*(ps+1)));i++) // 2 hex digits
             {
               int d = (cs>='a' && cs<='f') ? cs-'a'+10 :
                       (cs>='A' && cs<='F') ? cs-'A'+10 :
                       (cs>='0' && cs<='9') ? cs-'0'    :
                       -1;
               if (d>=0) { v<<=4; v|=d; ps++; } else break;
             }
             result = PToken(v);
           }
           break;
         case '\0': ps--; break; // backslash at the end of the pattern
         default:
           result = PToken(cs);
           break;
       }
     }
     return result;
   };
  
   while ((c=*ps))
   {
     switch (c)
     {
       case '^': // beginning of line (if first character of the pattern)
         prevTokenPos = tokenPos;
         addToken(ps==start ? PToken(PToken::Kind::BeginOfLine) :
                             PToken(c));
         break;
       case '$': // end of the line (if last character of the pattern)
         prevTokenPos = tokenPos;
         addToken(*(ps+1)=='\0' ? PToken(PToken::Kind::EndOfLine) :
                                 PToken(c));
         break;
       case '.': // any character
         prevTokenPos = tokenPos;
         addToken(PToken(PToken::Kind::Any));
         break;
       case '(': // begin of capture group
         prevTokenPos = tokenPos;
         addToken(PToken(PToken::Kind::BeginCapture));
         break;
       case ')': // end of capture group
         prevTokenPos = tokenPos;
         addToken(PToken(PToken::Kind::EndCapture));
         break;
       case '[': // character class
         {
           prevTokenPos = tokenPos;
           ps++;
           if (*ps==0) { error=true; return; }
           bool esc = *ps=='\\';
           PToken tok = getNextCharacter();
           ps++;
           if (!esc && tok.kind()==PToken::Kind::Character &&
                       tok.asciiValue()=='^') // negated character class
           {
             addToken(PToken(PToken::Kind::NegCharClass));
             if (*ps==0) { error=true; return; }
             tok = getNextCharacter();
             ps++;
           }
           else
           {
             addToken(PToken(PToken::Kind::CharClass));
           }
           uint16_t numTokens=0;
           while ((c=*ps))
           {
             if (c=='-' && *(ps+1)!=']' && *(ps+1)!=0) // range
             {
               getNextCharacter();
               ps++;
               PToken endTok = getNextCharacter();
               ps++;
               if (tok.value()>endTok.value())
               {
                 addToken(PToken(endTok.value(),tok.value())); // swap start and end
               }
               else
               {
                 addToken(PToken(tok.value(),endTok.value()));
               }
               numTokens++;
             }
             else // single char, from==to
             {
               if (tok.kind()==PToken::Kind::Character)
               {
                 addToken(PToken(tok.value(),tok.value()));
               }
               else // special token, add as-is since from>to
               {
                 addToken(tok);
               }
               numTokens++;
             }
             if (*ps==0) { error=true; return; } // expected at least a ]
             esc = *ps=='\\';
             tok = getNextCharacter();
             if (!esc && tok.kind()==PToken::Kind::Character &&
                         tok.value()==static_cast<uint16_t>(']'))
             {
               break; // end of character class
             }
             if (*ps==0) { error=true; return; } // no ] found
             ps++;
           }
           // set the value of either NegCharClass or CharClass
           data[prevTokenPos].setValue(numTokens);
         }
         break;
       case '*': // 0 or more
       case '+': // 1 or more
       case '?': // optional: 0 or 1
         {
           if (prevTokenPos==-1)
           {
             error=true;
             return;
           }
           switch (data[prevTokenPos].kind())
           {
             case PToken::Kind::BeginOfLine:  // $*  or  $+ or  $?
             case PToken::Kind::BeginOfWord:  // <* or <+ or <?
             case PToken::Kind::EndOfWord:    // >* or >+ or >?
             case PToken::Kind::Star:         // **  or  *+ or  *?
             case PToken::Kind::Optional:     // ?*  or  ?+ or  ??
               error=true;
               return;
             default: // ok
               break;
           }
           int ddiff = static_cast<int>(tokenPos-prevTokenPos);
           if (*ps=='+') // convert <pat>+ -> <pat><pat>*
           {
             // turn a sequence of token [T1...Tn] followed by '+' into [T1..Tn T1..Tn T*]
             //                          ddiff=n                                ^prevTokenPos
             data.resize(data.size()+ddiff);
             std::copy_n(data.begin()+prevTokenPos,ddiff,data.begin()+tokenPos);
             prevTokenPos+=ddiff;
             tokenPos+=ddiff;
           }
           data.insert(data.begin()+prevTokenPos,
                       c=='?' ? PToken(PToken::Kind::Optional) : PToken(PToken::Kind::Star));
           tokenPos++;
           addToken(PToken(PToken::Kind::End));
           // turn a sequence of tokens [T1 T2 T3] followed by 'T*' or into [T* T1 T2 T3 TEND]
           //                            ^prevTokenPos
           // same for 'T?'.
         }
         break;
       default:
         prevTokenPos = tokenPos;
         addToken(getNextCharacter());
         break;
     }
     ps++;
   }
   //addToken(PToken(PToken::Kind::End));
 }

引用了 reg::PToken::Alpha, reg::PToken::AlphaNum, reg::PToken::Any, reg::PToken::asciiValue(), reg::PToken::BeginCapture, reg::PToken::BeginOfLine, reg::PToken::BeginOfWord, reg::PToken::Character, reg::PToken::CharClass, data, reg::PToken::Digit, reg::PToken::End, reg::PToken::EndCapture, reg::PToken::EndOfLine, reg::PToken::EndOfWord, error, reg::PToken::kind(), reg::PToken::NegCharClass, reg::PToken::Optional, pattern, reg::PToken::Star, reg::PToken::value() , 以及 reg::PToken::WhiteSpace.

◆ matchAt()

bool reg::Ex::Private::matchAt	(	size_t	tokenPos,
		const std::string &	str,
		Match &	match,
		size_t	pos,
		int	level
	)		const

Internal matching routine.

参数

tokenPos	Offset into the token stream.
str	The input string to match against.
match	The object used to store the matching results.
pos	The position in the input string at which to start matching
level	Recursion level (used for debugging)

在文件 regex.cpp 第 438 行定义.

 {
   DBG("%d:matchAt(tokenPos=%zu, str='%s', pos=%zu)\n",level,tokenPos,str.c_str(),pos);
   auto isStartIdChar = [](char c) { return isalpha(c) || c=='_'; };
   auto isIdChar      = [](char c) { return isalnum(c) || c=='_'; };
   auto matchCharClass = [this,isStartIdChar,isIdChar](size_t tp,char c) -> bool
   {
     PToken tok = data[tp];
     bool negate = tok.kind()==PToken::Kind::NegCharClass;
     uint16_t numFields = tok.value();
     bool found = false;
     for (uint16_t i=0;i<numFields;i++)
     {
       tok = data[++tp];
       // first check for built-in ranges
       if ((tok.kind()==PToken::Kind::Alpha      && isStartIdChar(c)) ||
           (tok.kind()==PToken::Kind::AlphaNum   && isIdChar(c))      ||
           (tok.kind()==PToken::Kind::WhiteSpace && isspace(c))  ||
           (tok.kind()==PToken::Kind::Digit      && isdigit(c))
          )
       {
         found=true;
         break;
       }
       else // user specified range
       {
         uint16_t v = static_cast<uint16_t>(c);
         if (tok.from()<=v && v<=tok.to())
         {
           found=true;
           break;
         }
       }
     }
     DBG("matchCharClass(tp=%zu,c=%c (x%02x))=%d\n",tp,c,c,negate?!found:found);
     return negate ? !found : found;
   };
   size_t index = pos;
   enum SequenceType { Star, Optional };
   auto processSequence = [this,&tokenPos,&index,&str,&matchCharClass,
                           &isStartIdChar,&isIdChar,&match,&level,&pos](SequenceType type) -> bool
   {
     size_t startIndex = index;
     PToken tok = data[++tokenPos];
     if (tok.kind()==PToken::Kind::Character) // 'x*' -> eat x's
     {
       char c_tok = tok.asciiValue();
       while (index<=str.length() && str[index]==c_tok) { index++; if (type==Optional) break; }
       tokenPos++;
     }
     else if (tok.isCharClass()) // '[a-f0-4]* -> eat matching characters
     {
       while (index<=str.length() && matchCharClass(tokenPos,str[index])) { index++; if (type==Optional) break; }
       tokenPos+=tok.value()+1; // skip over character ranges + end token
     }
     else if (tok.kind()==PToken::Kind::Alpha) // '\a*' -> eat start id characters
     {
       while (index<=str.length() && isStartIdChar(str[index])) { index++; if (type==Optional) break; }
       tokenPos++;
     }
     else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' -> eat id characters
     {
       while (index<=str.length() && isIdChar(str[index])) { index++; if (type==Optional) break; }
       tokenPos++;
     }
     else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' -> eat spaces
     {
       while (index<=str.length() && isspace(str[index])) { index++; if (type==Optional) break; }
       tokenPos++;
     }
     else if (tok.kind()==PToken::Kind::Digit) // '\d*' -> eat digits
     {
       while (index<=str.length() && isdigit(str[index])) { index++; if (type==Optional) break; }
       tokenPos++;
     }
     else if (tok.kind()==PToken::Kind::Any) // '.*' -> eat all
     {
       if (type==Optional) index++; else index = str.length();
       tokenPos++;
     }
     tokenPos++; // skip over end marker
     while ((int)index>=(int)startIndex)
     {
       // pattern 'x*xy' should match 'xy' and 'xxxxy'
       bool found = matchAt(tokenPos,str,match,index,level+1);
       if (found)
       {
         match.setMatch(pos,index-pos+match.length());
         return true;
       }
       index--;
     }
     return false;
   };
  
   while (tokenPos<data.size())
   {
     PToken tok = data[tokenPos];
     //DBG("loop tokenPos=%zu token=%s\n",tokenPos,tok.kindStr());
     if (tok.kind()==PToken::Kind::Character) // match literal character
     {
       char c_tok = tok.asciiValue();
       if (index>=str.length() || str[index]!=c_tok) return false; // end of string, or non matching char
       index++,tokenPos++;
     }
     else if (tok.isCharClass())
     {
       if (index>=str.length() || !matchCharClass(tokenPos,str[index])) return false;
       index++,tokenPos+=tok.value()+1; // skip over character ranges + end token
     }
     else
     {
       switch (tok.kind())
       {
         case PToken::Kind::Alpha:
           if (index>=str.length() || !isStartIdChar(str[index])) return false;
           index++;
           break;
         case PToken::Kind::AlphaNum:
           if (index>=str.length() || !isIdChar(str[index])) return false;
           index++;
           break;
         case PToken::Kind::WhiteSpace:
           if (index>=str.length() || !isspace(str[index])) return false;
           index++;
           break;
         case PToken::Kind::Digit:
           if (index>=str.length() || !isdigit(str[index])) return false;
           index++;
           break;
         case PToken::Kind::BeginOfLine:
           if (index!=pos) return false;
           break;
         case PToken::Kind::EndOfLine:
           if (index<str.length()) return false;
           break;
         case PToken::Kind::BeginOfWord:
           DBG("BeginOfWord: index=%zu isIdChar(%c)=%d prev.isIdChar(%c)=%d\n",
               index,str[index],isIdChar(str[index]),
               index>0?str[index]-1:0,
               index>0?isIdChar(str[index-1]):-1);
           if (index>=str.length() ||
               !isIdChar(str[index]) ||
               (index>0 && isIdChar(str[index-1]))) return false;
           break;
         case PToken::Kind::EndOfWord:
           DBG("EndOfWord: index=%zu pos=%zu idIdChar(%c)=%d  prev.isIsChar(%c)=%d\n",
               index,pos,str[index],isIdChar(str[index]),
               index==0 ? 0 : str[index-1],
               index==0 ? -1 : isIdChar(str[index-1]));
           if (index<str.length() &&
               (isIdChar(str[index]) || index==0 || !isIdChar(str[index-1]))) return false;
           break;
         case PToken::Kind::BeginCapture:
           DBG("BeginCapture(%zu)\n",index);
           match.startCapture(index);
           break;
         case PToken::Kind::EndCapture:
           DBG("EndCapture(%zu)\n",index);
           match.endCapture(index);
           break;
         case PToken::Kind::Any:
           if (index>=str.length()) return false;
           index++;
           break;
         case PToken::Kind::Star:
           return processSequence(Star);
         case PToken::Kind::Optional:
           return processSequence(Optional);
         default:
           return false;
       }
       tokenPos++;
     }
   }
   match.setMatch(pos,index-pos);
   return true;
 }

引用了 reg::PToken::Alpha, reg::PToken::AlphaNum, reg::PToken::Any, reg::PToken::asciiValue(), reg::PToken::BeginCapture, reg::PToken::BeginOfLine, reg::PToken::BeginOfWord, reg::PToken::Character, DBG, reg::PToken::Digit, reg::PToken::EndCapture, reg::PToken::EndOfLine, reg::PToken::EndOfWord, reg::PToken::from(), reg::isalnum(), reg::isalpha(), reg::PToken::isCharClass(), reg::isdigit(), isIdChar, reg::isspace(), reg::PToken::kind(), reg::Ex::match(), reg::PToken::NegCharClass, reg::PToken::Optional, reg::PToken::Star, reg::PToken::to(), reg::PToken::value() , 以及 reg::PToken::WhiteSpace.

类成员变量说明

◆ data

std::vector<PToken> reg::Ex::Private::data

The token stream representing the compiled regular expression.

在文件 regex.cpp 第 187 行定义.

被这些函数引用 compile() , 以及 Private().

◆ error

bool reg::Ex::Private::error = false

Flag indicating that compiling the expression failed (set to true by compile() when the pattern is malformed)

在文件 regex.cpp 第 184 行定义.

被这些函数引用 compile().

◆ pattern

std::string reg::Ex::Private::pattern

The pattern string as passed by the user

在文件 regex.cpp 第 190 行定义.