Doxygen
reg::Ex::Private类 参考

Private members of a regular expression 更多...

Public 成员函数

 Private (const std::string &pat)
 Creates the private part 更多...
 
void compile ()
 Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching. 更多...
 
bool matchAt (size_t tokenPos, const std::string &str, Match &match, size_t pos, int level) const
 Internal matching routine. 更多...
 

Public 属性

bool error = false
 Flag indicating that compiling the expression failed (false when the pattern compiled successfully) 更多...
 
std::vector< PToken > data
 The token stream representing the compiled regular expression. 更多...
 
std::string pattern
 The pattern string as passed by the user 更多...
 

详细描述

Private members of a regular expression

在文件 regex.cpp 第 169 行定义.

构造及析构函数说明

◆ Private()

reg::Ex::Private::Private ( const std::string &  pat)
inline

Creates the private part

在文件 regex.cpp 第 173 行定义.

173  : pattern(pat)
174  {
175  data.reserve(100);
176  }

引用了 data.

成员函数说明

◆ compile()

void reg::Ex::Private::compile ( )

Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching.

在文件 regex.cpp 第 196 行定义.

197 {
198  error = false;
199  data.clear();
200  if (pattern.empty()) return;
201  const char *start = pattern.c_str();
202  const char *ps = start;
203  char c;
204 
205  int prevTokenPos=-1;
206  int tokenPos=0;
207 
208  auto addToken = [&](PToken tok)
209  {
210  tokenPos++;
211  data.emplace_back(tok);
212  };
213 
214  auto getNextCharacter = [&]() -> PToken
215  {
216  char cs=*ps;
217  PToken result = PToken(cs);
218  if (cs=='\\') // escaped character
219  {
220  ps++;
221  cs=*ps;
222  switch (cs)
223  {
224  case 'n': result = PToken('\n'); break;
225  case 'r': result = PToken('\r'); break;
226  case 't': result = PToken('\t'); break;
227  case 's': result = PToken(PToken::Kind::WhiteSpace); break;
228  case 'a': result = PToken(PToken::Kind::Alpha); break;
229  case 'w': result = PToken(PToken::Kind::AlphaNum); break;
230  case 'd': result = PToken(PToken::Kind::Digit); break;
231  case '<': result = PToken(PToken::Kind::BeginOfWord); break;
232  case '>': result = PToken(PToken::Kind::EndOfWord); break;
233  case 'x':
234  case 'X':
235  {
236  uint16_t v=0;
237  for (int i=0;i<2 && (cs=(*(ps+1)));i++) // 2 hex digits
238  {
239  int d = (cs>='a' && cs<='f') ? cs-'a'+10 :
240  (cs>='A' && cs<='F') ? cs-'A'+10 :
241  (cs>='0' && cs<='9') ? cs-'0' :
242  -1;
243  if (d>=0) { v<<=4; v|=d; ps++; } else break;
244  }
245  result = PToken(v);
246  }
247  break;
248  case '\0': ps--; break; // backslash at the end of the pattern
249  default:
250  result = PToken(cs);
251  break;
252  }
253  }
254  return result;
255  };
256 
257  while ((c=*ps))
258  {
259  switch (c)
260  {
261  case '^': // beginning of line (if first character of the pattern)
262  prevTokenPos = tokenPos;
263  addToken(ps==start ? PToken(PToken::Kind::BeginOfLine) :
264  PToken(c));
265  break;
266  case '$': // end of the line (if last character of the pattern)
267  prevTokenPos = tokenPos;
268  addToken(*(ps+1)=='\0' ? PToken(PToken::Kind::EndOfLine) :
269  PToken(c));
270  break;
271  case '.': // any character
272  prevTokenPos = tokenPos;
273  addToken(PToken(PToken::Kind::Any));
274  break;
275  case '(': // begin of capture group
276  prevTokenPos = tokenPos;
277  addToken(PToken(PToken::Kind::BeginCapture));
278  break;
279  case ')': // end of capture group
280  prevTokenPos = tokenPos;
281  addToken(PToken(PToken::Kind::EndCapture));
282  break;
283  case '[': // character class
284  {
285  prevTokenPos = tokenPos;
286  ps++;
287  if (*ps==0) { error=true; return; }
288  bool esc = *ps=='\\';
289  PToken tok = getNextCharacter();
290  ps++;
291  if (!esc && tok.kind()==PToken::Kind::Character &&
292  tok.asciiValue()=='^') // negated character class
293  {
294  addToken(PToken(PToken::Kind::NegCharClass));
295  if (*ps==0) { error=true; return; }
296  tok = getNextCharacter();
297  ps++;
298  }
299  else
300  {
301  addToken(PToken(PToken::Kind::CharClass));
302  }
303  uint16_t numTokens=0;
304  while ((c=*ps))
305  {
306  if (c=='-' && *(ps+1)!=']' && *(ps+1)!=0) // range
307  {
308  getNextCharacter();
309  ps++;
310  PToken endTok = getNextCharacter();
311  ps++;
312  if (tok.value()>endTok.value())
313  {
314  addToken(PToken(endTok.value(),tok.value())); // swap start and end
315  }
316  else
317  {
318  addToken(PToken(tok.value(),endTok.value()));
319  }
320  numTokens++;
321  }
322  else // single char, from==to
323  {
324  if (tok.kind()==PToken::Kind::Character)
325  {
326  addToken(PToken(tok.value(),tok.value()));
327  }
328  else // special token, add as-is since from>to
329  {
330  addToken(tok);
331  }
332  numTokens++;
333  }
334  if (*ps==0) { error=true; return; } // expected at least a ]
335  esc = *ps=='\\';
336  tok = getNextCharacter();
337  if (!esc && tok.kind()==PToken::Kind::Character &&
338  tok.value()==static_cast<uint16_t>(']'))
339  {
340  break; // end of character class
341  }
342  if (*ps==0) { error=true; return; } // no ] found
343  ps++;
344  }
345  // set the value of either NegCharClass or CharClass
346  data[prevTokenPos].setValue(numTokens);
347  }
348  break;
349  case '*': // 0 or more
350  case '+': // 1 or more
351  case '?': // optional: 0 or 1
352  {
353  if (prevTokenPos==-1)
354  {
355  error=true;
356  return;
357  }
358  switch (data[prevTokenPos].kind())
359  {
360  case PToken::Kind::BeginOfLine: // ^* or ^+ or ^?
361  case PToken::Kind::BeginOfWord: // <* or <+ or <?
362  case PToken::Kind::EndOfWord: // >* or >+ or >?
363  case PToken::Kind::Star: // ** or *+ or *?
364  case PToken::Kind::Optional: // ?* or ?+ or ??
365  error=true;
366  return;
367  default: // ok
368  break;
369  }
370  int ddiff = static_cast<int>(tokenPos-prevTokenPos);
371  if (*ps=='+') // convert <pat>+ -> <pat><pat>*
372  {
373  // turn a sequence of token [T1...Tn] followed by '+' into [T1..Tn T1..Tn T*]
374  // ddiff=n ^prevTokenPos
375  data.resize(data.size()+ddiff);
376  std::copy_n(data.begin()+prevTokenPos,ddiff,data.begin()+tokenPos);
377  prevTokenPos+=ddiff;
378  tokenPos+=ddiff;
379  }
380  data.insert(data.begin()+prevTokenPos,
381  c=='?' ? PToken(PToken::Kind::Optional) : PToken(PToken::Kind::Star));
382  tokenPos++;
383  addToken(PToken(PToken::Kind::End));
384  // turn a sequence of tokens [T1 T2 T3] followed by 'T*' or into [T* T1 T2 T3 TEND]
385  // ^prevTokenPos
386  // same for 'T?'.
387  }
388  break;
389  default:
390  prevTokenPos = tokenPos;
391  addToken(getNextCharacter());
392  break;
393  }
394  ps++;
395  }
396  //addToken(PToken(PToken::Kind::End));
397 }

引用了 reg::PToken::Alpha, reg::PToken::AlphaNum, reg::PToken::Any, reg::PToken::asciiValue(), reg::PToken::BeginCapture, reg::PToken::BeginOfLine, reg::PToken::BeginOfWord, reg::PToken::Character, reg::PToken::CharClass, data, reg::PToken::Digit, reg::PToken::End, reg::PToken::EndCapture, reg::PToken::EndOfLine, reg::PToken::EndOfWord, error, reg::PToken::kind(), reg::PToken::NegCharClass, reg::PToken::Optional, pattern, reg::PToken::Star, reg::PToken::value() , 以及 reg::PToken::WhiteSpace.

◆ matchAt()

bool reg::Ex::Private::matchAt ( size_t  tokenPos,
const std::string &  str,
Match &  match,
size_t  pos,
int  level 
) const

Internal matching routine.

参数
tokenPos: Offset into the token stream.
str: The input string to match against.
match: The object used to store the matching results.
pos: The position in the input string at which to start matching.
level: Recursion level (used for debugging).

在文件 regex.cpp 第 438 行定义.

439 {
440  DBG("%d:matchAt(tokenPos=%zu, str='%s', pos=%zu)\n",level,tokenPos,str.c_str(),pos);
441  auto isStartIdChar = [](char c) { return isalpha(c) || c=='_'; };
442  auto isIdChar = [](char c) { return isalnum(c) || c=='_'; };
443  auto matchCharClass = [this,isStartIdChar,isIdChar](size_t tp,char c) -> bool
444  {
445  PToken tok = data[tp];
446  bool negate = tok.kind()==PToken::Kind::NegCharClass;
447  uint16_t numFields = tok.value();
448  bool found = false;
449  for (uint16_t i=0;i<numFields;i++)
450  {
451  tok = data[++tp];
452  // first check for built-in ranges
453  if ((tok.kind()==PToken::Kind::Alpha && isStartIdChar(c)) ||
454  (tok.kind()==PToken::Kind::AlphaNum && isIdChar(c)) ||
455  (tok.kind()==PToken::Kind::WhiteSpace && isspace(c)) ||
456  (tok.kind()==PToken::Kind::Digit && isdigit(c))
457  )
458  {
459  found=true;
460  break;
461  }
462  else // user specified range
463  {
464  uint16_t v = static_cast<uint16_t>(c);
465  if (tok.from()<=v && v<=tok.to())
466  {
467  found=true;
468  break;
469  }
470  }
471  }
472  DBG("matchCharClass(tp=%zu,c=%c (x%02x))=%d\n",tp,c,c,negate?!found:found);
473  return negate ? !found : found;
474  };
475  size_t index = pos;
476  enum SequenceType { Star, Optional };
477  auto processSequence = [this,&tokenPos,&index,&str,&matchCharClass,
478  &isStartIdChar,&isIdChar,&match,&level,&pos](SequenceType type) -> bool
479  {
480  size_t startIndex = index;
481  PToken tok = data[++tokenPos];
482  if (tok.kind()==PToken::Kind::Character) // 'x*' -> eat x's
483  {
484  char c_tok = tok.asciiValue();
485  while (index<=str.length() && str[index]==c_tok) { index++; if (type==Optional) break; }
486  tokenPos++;
487  }
488  else if (tok.isCharClass()) // '[a-f0-4]* -> eat matching characters
489  {
490  while (index<=str.length() && matchCharClass(tokenPos,str[index])) { index++; if (type==Optional) break; }
491  tokenPos+=tok.value()+1; // skip over character ranges + end token
492  }
493  else if (tok.kind()==PToken::Kind::Alpha) // '\a*' -> eat start id characters
494  {
495  while (index<=str.length() && isStartIdChar(str[index])) { index++; if (type==Optional) break; }
496  tokenPos++;
497  }
498  else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' -> eat id characters
499  {
500  while (index<=str.length() && isIdChar(str[index])) { index++; if (type==Optional) break; }
501  tokenPos++;
502  }
503  else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' -> eat spaces
504  {
505  while (index<=str.length() && isspace(str[index])) { index++; if (type==Optional) break; }
506  tokenPos++;
507  }
508  else if (tok.kind()==PToken::Kind::Digit) // '\d*' -> eat digits
509  {
510  while (index<=str.length() && isdigit(str[index])) { index++; if (type==Optional) break; }
511  tokenPos++;
512  }
513  else if (tok.kind()==PToken::Kind::Any) // '.*' -> eat all
514  {
515  if (type==Optional) index++; else index = str.length();
516  tokenPos++;
517  }
518  tokenPos++; // skip over end marker
519  while ((int)index>=(int)startIndex)
520  {
521  // pattern 'x*xy' should match 'xy' and 'xxxxy'
522  bool found = matchAt(tokenPos,str,match,index,level+1);
523  if (found)
524  {
525  match.setMatch(pos,index-pos+match.length());
526  return true;
527  }
528  index--;
529  }
530  return false;
531  };
532 
533  while (tokenPos<data.size())
534  {
535  PToken tok = data[tokenPos];
536  //DBG("loop tokenPos=%zu token=%s\n",tokenPos,tok.kindStr());
537  if (tok.kind()==PToken::Kind::Character) // match literal character
538  {
539  char c_tok = tok.asciiValue();
540  if (index>=str.length() || str[index]!=c_tok) return false; // end of string, or non matching char
541  index++,tokenPos++;
542  }
543  else if (tok.isCharClass())
544  {
545  if (index>=str.length() || !matchCharClass(tokenPos,str[index])) return false;
546  index++,tokenPos+=tok.value()+1; // skip over character ranges + end token
547  }
548  else
549  {
550  switch (tok.kind())
551  {
552  case PToken::Kind::Alpha:
553  if (index>=str.length() || !isStartIdChar(str[index])) return false;
554  index++;
555  break;
557  if (index>=str.length() || !isIdChar(str[index])) return false;
558  index++;
559  break;
561  if (index>=str.length() || !isspace(str[index])) return false;
562  index++;
563  break;
564  case PToken::Kind::Digit:
565  if (index>=str.length() || !isdigit(str[index])) return false;
566  index++;
567  break;
569  if (index!=pos) return false;
570  break;
572  if (index<str.length()) return false;
573  break;
575  DBG("BeginOfWord: index=%zu isIdChar(%c)=%d prev.isIdChar(%c)=%d\n",
576  index,str[index],isIdChar(str[index]),
577  index>0?str[index]-1:0,
578  index>0?isIdChar(str[index-1]):-1);
579  if (index>=str.length() ||
580  !isIdChar(str[index]) ||
581  (index>0 && isIdChar(str[index-1]))) return false;
582  break;
584  DBG("EndOfWord: index=%zu pos=%zu idIdChar(%c)=%d prev.isIsChar(%c)=%d\n",
585  index,pos,str[index],isIdChar(str[index]),
586  index==0 ? 0 : str[index-1],
587  index==0 ? -1 : isIdChar(str[index-1]));
588  if (index<str.length() &&
589  (isIdChar(str[index]) || index==0 || !isIdChar(str[index-1]))) return false;
590  break;
592  DBG("BeginCapture(%zu)\n",index);
593  match.startCapture(index);
594  break;
596  DBG("EndCapture(%zu)\n",index);
597  match.endCapture(index);
598  break;
599  case PToken::Kind::Any:
600  if (index>=str.length()) return false;
601  index++;
602  break;
603  case PToken::Kind::Star:
604  return processSequence(Star);
606  return processSequence(Optional);
607  default:
608  return false;
609  }
610  tokenPos++;
611  }
612  }
613  match.setMatch(pos,index-pos);
614  return true;
615 }

引用了 reg::PToken::Alpha, reg::PToken::AlphaNum, reg::PToken::Any, reg::PToken::asciiValue(), reg::PToken::BeginCapture, reg::PToken::BeginOfLine, reg::PToken::BeginOfWord, reg::PToken::Character, DBG, reg::PToken::Digit, reg::PToken::EndCapture, reg::PToken::EndOfLine, reg::PToken::EndOfWord, reg::PToken::from(), reg::isalnum(), reg::isalpha(), reg::PToken::isCharClass(), reg::isdigit(), isIdChar, reg::isspace(), reg::PToken::kind(), reg::Ex::match(), reg::PToken::NegCharClass, reg::PToken::Optional, reg::PToken::Star, reg::PToken::to(), reg::PToken::value() , 以及 reg::PToken::WhiteSpace.

类成员变量说明

◆ data

std::vector<PToken> reg::Ex::Private::data

The token stream representing the compiled regular expression.

在文件 regex.cpp 第 187 行定义.

被这些函数引用 compile() , 以及 Private().

◆ error

bool reg::Ex::Private::error = false

Flag indicating that compiling the expression failed (false when the pattern compiled successfully)

在文件 regex.cpp 第 184 行定义.

被这些函数引用 compile().

◆ pattern

std::string reg::Ex::Private::pattern

The pattern string as passed by the user

在文件 regex.cpp 第 190 行定义.

被这些函数引用 compile().


该类的文档由以下文件生成:
regex.cpp
reg::Ex::Private::data
std::vector< PToken > data
The token stream representing the compiled regular expression.
Definition: regex.cpp:187
reg::isalpha
static bool isalpha(char c)
Definition: regex.cpp:38
reg::PToken::Kind::Alpha
@ Alpha
reg::Ex::Private::pattern
std::string pattern
The pattern string as passed by the user
Definition: regex.cpp:190
reg::PToken::Kind::Any
@ Any
reg::PToken::Kind::End
@ End
DBG
#define DBG(fmt,...)
Definition: regex.cpp:27
reg::PToken::Kind::Character
@ Character
reg::PToken::Kind::EndOfWord
@ EndOfWord
reg::isdigit
static bool isdigit(char c)
Definition: regex.cpp:43
reg::isspace
static bool isspace(char c)
Definition: regex.cpp:33
reg::PToken::Kind::EndOfLine
@ EndOfLine
reg::Ex::match
bool match(const std::string &str, Match &match, size_t pos=0) const
Check if a given string matches this regular expression.
Definition: regex.cpp:676
reg::Ex::Private::error
bool error
Flag indicating that compiling the expression failed (false when the pattern compiled successfully)
Definition: regex.cpp:184
reg::isalnum
static bool isalnum(char c)
Definition: regex.cpp:48
reg::PToken::Kind::Digit
@ Digit
reg::PToken::Kind::CharClass
@ CharClass
reg::PToken::Kind::AlphaNum
@ AlphaNum
reg::PToken::Kind::Optional
@ Optional
reg::PToken::Kind::BeginOfWord
@ BeginOfWord
reg::PToken::Kind::WhiteSpace
@ WhiteSpace
reg::Ex::Private::matchAt
bool matchAt(size_t tokenPos, const std::string &str, Match &match, size_t pos, int level) const
Internal matching routine.
Definition: regex.cpp:438
reg::PToken::Kind::BeginOfLine
@ BeginOfLine
reg::PToken::Kind::Star
@ Star
reg::PToken::Kind::BeginCapture
@ BeginCapture
reg::PToken::Kind::EndCapture
@ EndCapture
isIdChar
#define isIdChar(i)
Definition: markdown.cpp:174
reg::PToken::Kind::NegCharClass
@ NegCharClass