Doxygen
regex.cpp
浏览该文件的文档.
1 /******************************************************************************
2  *
3  * Copyright (C) 1997-2021 by Dimitri van Heesch.
4  *
5  * Permission to use, copy, modify, and distribute this software and its
6  * documentation under the terms of the GNU General Public License is hereby
7  * granted. No representations are made about the suitability of this software
8  * for any purpose. It is provided "as is" without express or implied warranty.
9  * See the GNU General Public License for more details.
10  *
11  * Documents produced by Doxygen are derivative works derived from the
12  * input used in their production; they are not affected by this license.
13  *
14  */
15 
16 #include "regex.h"
17 #include <cstdint>
18 #include <vector>
19 #include <cctype>
20 #include <cassert>
21 #include <algorithm>
22 
23 #define ENABLE_DEBUG 0
24 #if ENABLE_DEBUG
25 #define DBG(fmt,...) do { fprintf(stderr,fmt,__VA_ARGS__); } while(0)
26 #else
27 #define DBG(fmt,...) do {} while(0)
28 #endif
29 
30 namespace reg
31 {
32 
/** Returns true iff \a c is a space, tab, newline or carriage return. */
static inline bool isspace(char c)
{
  switch (c)
  {
    case ' ':
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
37 
/** Returns true iff \a c is an ASCII letter or a byte with the high bit set (value >= 128). */
static inline bool isalpha(char c)
{
  const unsigned char uc = static_cast<unsigned char>(c);
  if (uc>=128) return true; // every byte outside the ASCII range counts as a letter
  const bool lower = uc>='a' && uc<='z';
  const bool upper = uc>='A' && uc<='Z';
  return lower || upper;
}
42 
/** Returns true iff \a c is one of the decimal digits '0'..'9'. */
static inline bool isdigit(char c)
{
  if (c<'0') return false;
  return c<='9';
}
47 
48 static inline bool isalnum(char c)
49 {
50  return isalpha(c) || isdigit(c);
51 }
52 
53 
/** Class representing a token in the compiled regular expression token stream.
 *  A token has a kind and an optional value whose meaning depends on the kind.
 *  It is also possible to store a (from,to) character range in a token.
 *
 *  Both pieces are packed into one 32-bit word: the upper 16 bits hold the kind
 *  (or the 'from' value of a range), the lower 16 bits hold the value
 *  (or the 'to' value of a range).
 */
class PToken
{
  public:
    /** The kind of token.
     *
     *  Ranges per bit mask:
     *  - `0x00FF` from part of a range, except for `0x0000` which is the End marker
     *  - `0x1FFF` built-in ranges
     *  - `0x2FFF` user defined ranges
     *  - `0x4FFF` special operations
     *  - `0x8000` literal character
     */
    enum class Kind : uint16_t
    {
      End          = 0x0000,
      WhiteSpace   = 0x1001, // \s range [ \t\r\n]
      Digit        = 0x1002, // \d range [0-9]
      Alpha        = 0x1003, // \a range [a-z_A-Z\x80-\xFF]
      AlphaNum     = 0x1004, // \w range [a-z_A-Z0-9\x80-\xFF]
      CharClass    = 0x2001, // []
      NegCharClass = 0x2002, // [^]
      BeginOfLine  = 0x4001, // ^
      EndOfLine    = 0x4002, // $
      BeginOfWord  = 0x4003, // \<
      EndOfWord    = 0x4004, // \>
      BeginCapture = 0x4005, // (
      EndCapture   = 0x4006, // )
      Any          = 0x4007, // .
      Star         = 0x4008, // *
      Optional     = 0x4009, // ?
      Character    = 0x8000  // c
    };

    /** Returns a string representation of the token's kind (useful for debugging).
     *  Tokens whose upper half is a non-zero value below 0x1000 are reported as "Range";
     *  a kind value that is not one of the enumerators is reported as "Unknown".
     */
    const char *kindStr() const
    {
      if ((m_rep>>16)>=0x1000 || m_rep==0)
      {
        switch(static_cast<Kind>((m_rep>>16)))
        {
          case Kind::End:          return "End";
          case Kind::Alpha:        return "Alpha";
          case Kind::AlphaNum:     return "AlphaNum";
          case Kind::WhiteSpace:   return "WhiteSpace";
          case Kind::Digit:        return "Digit";
          case Kind::CharClass:    return "CharClass";
          case Kind::NegCharClass: return "NegCharClass";
          case Kind::Character:    return "Character";
          case Kind::BeginOfLine:  return "BeginOfLine";
          case Kind::EndOfLine:    return "EndOfLine";
          case Kind::BeginOfWord:  return "BeginOfWord";
          case Kind::EndOfWord:    return "EndOfWord";
          case Kind::BeginCapture: return "BeginCapture";
          case Kind::EndCapture:   return "EndCapture";
          case Kind::Any:          return "Any";
          case Kind::Star:         return "Star";
          case Kind::Optional:     return "Optional";
        }
        // not one of the enumerators: never flow off the end of a non-void function
        return "Unknown";
      }
      return "Range";
    }

    /** Creates a token of kind 'End' */
    PToken() : m_rep(0) {}

    /** Creates a token of the given kind \a k */
    explicit PToken(Kind k) : m_rep(static_cast<uint32_t>(k)<<16) {}

    /** Create a token for a single character.
     *  The character is converted through unsigned char first, so a byte >= 0x80 is stored
     *  as 0x0080..0x00FF instead of being sign-extended into the bits that hold the kind.
     */
    PToken(char c) : m_rep((static_cast<uint32_t>(Kind::Character)<<16) |
                           static_cast<uint32_t>(static_cast<unsigned char>(c))) {}

    /** Create a token for a byte of an UTF-8 character */
    PToken(uint16_t v) : m_rep((static_cast<uint32_t>(Kind::Character)<<16) |
                               static_cast<uint32_t>(v)) {}

    /** Create a token representing a range from one character \a from to another character \a to */
    PToken(uint16_t from,uint16_t to) : m_rep(static_cast<uint32_t>(from)<<16 | to) {}

    /** Sets the value for a token, leaving its kind untouched */
    void setValue(uint16_t value) { m_rep = (m_rep & 0xFFFF0000) | value; }

    /** Returns the kind of the token */
    Kind kind() const { return static_cast<Kind>(m_rep>>16); }

    /** Returns the 'from' part of the character range. Only valid if this token represents a range */
    uint16_t from() const { return m_rep>>16; }

    /** Returns the 'to' part of the character range. Only valid if this token represents a range */
    uint16_t to() const { return m_rep & 0xFFFF; }

    /** Returns the value for this token */
    uint16_t value() const { return m_rep & 0xFFFF; }

    /** Returns the value for this token as a ASCII character */
    char asciiValue() const { return static_cast<char>(m_rep); }

    /** Returns true iff this token represents a range of characters */
    bool isRange() const { return m_rep!=0 && from()<=to(); }

    /** Returns true iff this token is a positive or negative character class */
    bool isCharClass() const { return kind()==Kind::CharClass || kind()==Kind::NegCharClass; }

  private:
    uint32_t m_rep; //!< upper 16 bits: kind or range start, lower 16 bits: value or range end
};
167 
168 /** Private members of a regular expression */
170 {
171  public:
172  /** Creates the private part */
173  Private(const std::string &pat) : pattern(pat)
174  {
175  data.reserve(100);
176  }
177  void compile();
178 #if ENABLE_DEBUG
179  void dump();
180 #endif
181  bool matchAt(size_t tokenPos,const std::string &str,Match &match,size_t pos,int level) const;
182 
183  /** Flag indicating the expression was successfully compiled */
184  bool error = false;
185 
186  /** The token stream representing the compiled regular expression. */
187  std::vector<PToken> data; // compiled pattern
188 
189  /** The pattern string as passed by the user */
190  std::string pattern;
191 };
192 
193 /** Compiles a regular expression passed as a string into a stream of tokens that can be used for
194  * efficient searching.
195  */
197 {
198  error = false;
199  data.clear();
200  if (pattern.empty()) return;
201  const char *start = pattern.c_str();
202  const char *ps = start;
203  char c;
204 
205  int prevTokenPos=-1;
206  int tokenPos=0;
207 
208  auto addToken = [&](PToken tok)
209  {
210  tokenPos++;
211  data.emplace_back(tok);
212  };
213 
214  auto getNextCharacter = [&]() -> PToken
215  {
216  char cs=*ps;
217  PToken result = PToken(cs);
218  if (cs=='\\') // escaped character
219  {
220  ps++;
221  cs=*ps;
222  switch (cs)
223  {
224  case 'n': result = PToken('\n'); break;
225  case 'r': result = PToken('\r'); break;
226  case 't': result = PToken('\t'); break;
227  case 's': result = PToken(PToken::Kind::WhiteSpace); break;
228  case 'a': result = PToken(PToken::Kind::Alpha); break;
229  case 'w': result = PToken(PToken::Kind::AlphaNum); break;
230  case 'd': result = PToken(PToken::Kind::Digit); break;
231  case '<': result = PToken(PToken::Kind::BeginOfWord); break;
232  case '>': result = PToken(PToken::Kind::EndOfWord); break;
233  case 'x':
234  case 'X':
235  {
236  uint16_t v=0;
237  for (int i=0;i<2 && (cs=(*(ps+1)));i++) // 2 hex digits
238  {
239  int d = (cs>='a' && cs<='f') ? cs-'a'+10 :
240  (cs>='A' && cs<='F') ? cs-'A'+10 :
241  (cs>='0' && cs<='9') ? cs-'0' :
242  -1;
243  if (d>=0) { v<<=4; v|=d; ps++; } else break;
244  }
245  result = PToken(v);
246  }
247  break;
248  case '\0': ps--; break; // backslash at the end of the pattern
249  default:
250  result = PToken(cs);
251  break;
252  }
253  }
254  return result;
255  };
256 
257  while ((c=*ps))
258  {
259  switch (c)
260  {
261  case '^': // beginning of line (if first character of the pattern)
262  prevTokenPos = tokenPos;
263  addToken(ps==start ? PToken(PToken::Kind::BeginOfLine) :
264  PToken(c));
265  break;
266  case '$': // end of the line (if last character of the pattern)
267  prevTokenPos = tokenPos;
268  addToken(*(ps+1)=='\0' ? PToken(PToken::Kind::EndOfLine) :
269  PToken(c));
270  break;
271  case '.': // any character
272  prevTokenPos = tokenPos;
273  addToken(PToken(PToken::Kind::Any));
274  break;
275  case '(': // begin of capture group
276  prevTokenPos = tokenPos;
278  break;
279  case ')': // end of capture group
280  prevTokenPos = tokenPos;
281  addToken(PToken(PToken::Kind::EndCapture));
282  break;
283  case '[': // character class
284  {
285  prevTokenPos = tokenPos;
286  ps++;
287  if (*ps==0) { error=true; return; }
288  bool esc = *ps=='\\';
289  PToken tok = getNextCharacter();
290  ps++;
291  if (!esc && tok.kind()==PToken::Kind::Character &&
292  tok.asciiValue()=='^') // negated character class
293  {
295  if (*ps==0) { error=true; return; }
296  tok = getNextCharacter();
297  ps++;
298  }
299  else
300  {
301  addToken(PToken(PToken::Kind::CharClass));
302  }
303  uint16_t numTokens=0;
304  while ((c=*ps))
305  {
306  if (c=='-' && *(ps+1)!=']' && *(ps+1)!=0) // range
307  {
308  getNextCharacter();
309  ps++;
310  PToken endTok = getNextCharacter();
311  ps++;
312  if (tok.value()>endTok.value())
313  {
314  addToken(PToken(endTok.value(),tok.value())); // swap start and end
315  }
316  else
317  {
318  addToken(PToken(tok.value(),endTok.value()));
319  }
320  numTokens++;
321  }
322  else // single char, from==to
323  {
324  if (tok.kind()==PToken::Kind::Character)
325  {
326  addToken(PToken(tok.value(),tok.value()));
327  }
328  else // special token, add as-is since from>to
329  {
330  addToken(tok);
331  }
332  numTokens++;
333  }
334  if (*ps==0) { error=true; return; } // expected at least a ]
335  esc = *ps=='\\';
336  tok = getNextCharacter();
337  if (!esc && tok.kind()==PToken::Kind::Character &&
338  tok.value()==static_cast<uint16_t>(']'))
339  {
340  break; // end of character class
341  }
342  if (*ps==0) { error=true; return; } // no ] found
343  ps++;
344  }
345  // set the value of either NegCharClass or CharClass
346  data[prevTokenPos].setValue(numTokens);
347  }
348  break;
349  case '*': // 0 or more
350  case '+': // 1 or more
351  case '?': // optional: 0 or 1
352  {
353  if (prevTokenPos==-1)
354  {
355  error=true;
356  return;
357  }
358  switch (data[prevTokenPos].kind())
359  {
360  case PToken::Kind::BeginOfLine: // $* or $+ or $?
361  case PToken::Kind::BeginOfWord: // <* or <+ or <?
362  case PToken::Kind::EndOfWord: // >* or >+ or >?
363  case PToken::Kind::Star: // ** or *+ or *?
364  case PToken::Kind::Optional: // ?* or ?+ or ??
365  error=true;
366  return;
367  default: // ok
368  break;
369  }
370  int ddiff = static_cast<int>(tokenPos-prevTokenPos);
371  if (*ps=='+') // convert <pat>+ -> <pat><pat>*
372  {
373  // turn a sequence of token [T1...Tn] followed by '+' into [T1..Tn T1..Tn T*]
374  // ddiff=n ^prevTokenPos
375  data.resize(data.size()+ddiff);
376  std::copy_n(data.begin()+prevTokenPos,ddiff,data.begin()+tokenPos);
377  prevTokenPos+=ddiff;
378  tokenPos+=ddiff;
379  }
380  data.insert(data.begin()+prevTokenPos,
382  tokenPos++;
383  addToken(PToken(PToken::Kind::End));
384  // turn a sequence of tokens [T1 T2 T3] followed by 'T*' or into [T* T1 T2 T3 TEND]
385  // ^prevTokenPos
386  // same for 'T?'.
387  }
388  break;
389  default:
390  prevTokenPos = tokenPos;
391  addToken(getNextCharacter());
392  break;
393  }
394  ps++;
395  }
396  //addToken(PToken(PToken::Kind::End));
397 }
398 
399 #if ENABLE_DEBUG
400 /** Dump the compiled token stream for this regular expression. For debugging purposes. */
401 void Ex::Private::dump()
402 {
403  size_t l = data.size();
404  size_t i =0;
405  DBG("==== compiled token stream for pattern '%s' ===\n",pattern.c_str());
406  while (i<l)
407  {
408  DBG("[%s:%04x]\n",data[i].kindStr(),data[i].value());
409  if (data[i].kind()==PToken::Kind::CharClass || data[i].kind()==PToken::Kind::NegCharClass)
410  {
411  uint16_t num = data[i].value();
412  while (num>0 && i<l)
413  {
414  i++;
415  if (data[i].isRange()) // from-to range
416  {
417  DBG("[%04x(%c)-%04x(%c)]\n",data[i].from(),data[i].from(),data[i].to(),data[i].to());
418  }
419  else // special character like \n or \s
420  {
421  DBG("[%s:%04x]\n",data[i].kindStr(),data[i].value());
422  }
423  num--;
424  }
425  }
426  i++;
427  }
428 }
429 #endif
430 
431 /** Internal matching routine.
432  * @param tokenPos Offset into the token stream.
433  * @param str The input string to match against.
434  * @param match The object used to store the matching results.
435  * @param pos The position in the input string to start with matching
436  * @param level Recursion level (used for debugging)
437  */
438 bool Ex::Private::matchAt(size_t tokenPos,const std::string &str,Match &match,const size_t pos,int level) const
439 {
440  DBG("%d:matchAt(tokenPos=%zu, str='%s', pos=%zu)\n",level,tokenPos,str.c_str(),pos);
441  auto isStartIdChar = [](char c) { return isalpha(c) || c=='_'; };
442  auto isIdChar = [](char c) { return isalnum(c) || c=='_'; };
443  auto matchCharClass = [this,isStartIdChar,isIdChar](size_t tp,char c) -> bool
444  {
445  PToken tok = data[tp];
446  bool negate = tok.kind()==PToken::Kind::NegCharClass;
447  uint16_t numFields = tok.value();
448  bool found = false;
449  for (uint16_t i=0;i<numFields;i++)
450  {
451  tok = data[++tp];
452  // first check for built-in ranges
453  if ((tok.kind()==PToken::Kind::Alpha && isStartIdChar(c)) ||
454  (tok.kind()==PToken::Kind::AlphaNum && isIdChar(c)) ||
455  (tok.kind()==PToken::Kind::WhiteSpace && isspace(c)) ||
456  (tok.kind()==PToken::Kind::Digit && isdigit(c))
457  )
458  {
459  found=true;
460  break;
461  }
462  else // user specified range
463  {
464  uint16_t v = static_cast<uint16_t>(c);
465  if (tok.from()<=v && v<=tok.to())
466  {
467  found=true;
468  break;
469  }
470  }
471  }
472  DBG("matchCharClass(tp=%zu,c=%c (x%02x))=%d\n",tp,c,c,negate?!found:found);
473  return negate ? !found : found;
474  };
475  size_t index = pos;
476  enum SequenceType { Star, Optional };
477  auto processSequence = [this,&tokenPos,&index,&str,&matchCharClass,
478  &isStartIdChar,&isIdChar,&match,&level,&pos](SequenceType type) -> bool
479  {
480  size_t startIndex = index;
481  PToken tok = data[++tokenPos];
482  if (tok.kind()==PToken::Kind::Character) // 'x*' -> eat x's
483  {
484  char c_tok = tok.asciiValue();
485  while (index<=str.length() && str[index]==c_tok) { index++; if (type==Optional) break; }
486  tokenPos++;
487  }
488  else if (tok.isCharClass()) // '[a-f0-4]* -> eat matching characters
489  {
490  while (index<=str.length() && matchCharClass(tokenPos,str[index])) { index++; if (type==Optional) break; }
491  tokenPos+=tok.value()+1; // skip over character ranges + end token
492  }
493  else if (tok.kind()==PToken::Kind::Alpha) // '\a*' -> eat start id characters
494  {
495  while (index<=str.length() && isStartIdChar(str[index])) { index++; if (type==Optional) break; }
496  tokenPos++;
497  }
498  else if (tok.kind()==PToken::Kind::AlphaNum) // '\w*' -> eat id characters
499  {
500  while (index<=str.length() && isIdChar(str[index])) { index++; if (type==Optional) break; }
501  tokenPos++;
502  }
503  else if (tok.kind()==PToken::Kind::WhiteSpace) // '\s*' -> eat spaces
504  {
505  while (index<=str.length() && isspace(str[index])) { index++; if (type==Optional) break; }
506  tokenPos++;
507  }
508  else if (tok.kind()==PToken::Kind::Digit) // '\d*' -> eat digits
509  {
510  while (index<=str.length() && isdigit(str[index])) { index++; if (type==Optional) break; }
511  tokenPos++;
512  }
513  else if (tok.kind()==PToken::Kind::Any) // '.*' -> eat all
514  {
515  if (type==Optional) index++; else index = str.length();
516  tokenPos++;
517  }
518  tokenPos++; // skip over end marker
519  while ((int)index>=(int)startIndex)
520  {
521  // pattern 'x*xy' should match 'xy' and 'xxxxy'
522  bool found = matchAt(tokenPos,str,match,index,level+1);
523  if (found)
524  {
525  match.setMatch(pos,index-pos+match.length());
526  return true;
527  }
528  index--;
529  }
530  return false;
531  };
532 
533  while (tokenPos<data.size())
534  {
535  PToken tok = data[tokenPos];
536  //DBG("loop tokenPos=%zu token=%s\n",tokenPos,tok.kindStr());
537  if (tok.kind()==PToken::Kind::Character) // match literal character
538  {
539  char c_tok = tok.asciiValue();
540  if (index>=str.length() || str[index]!=c_tok) return false; // end of string, or non matching char
541  index++,tokenPos++;
542  }
543  else if (tok.isCharClass())
544  {
545  if (index>=str.length() || !matchCharClass(tokenPos,str[index])) return false;
546  index++,tokenPos+=tok.value()+1; // skip over character ranges + end token
547  }
548  else
549  {
550  switch (tok.kind())
551  {
552  case PToken::Kind::Alpha:
553  if (index>=str.length() || !isStartIdChar(str[index])) return false;
554  index++;
555  break;
557  if (index>=str.length() || !isIdChar(str[index])) return false;
558  index++;
559  break;
561  if (index>=str.length() || !isspace(str[index])) return false;
562  index++;
563  break;
564  case PToken::Kind::Digit:
565  if (index>=str.length() || !isdigit(str[index])) return false;
566  index++;
567  break;
569  if (index!=pos) return false;
570  break;
572  if (index<str.length()) return false;
573  break;
575  DBG("BeginOfWord: index=%zu isIdChar(%c)=%d prev.isIdChar(%c)=%d\n",
576  index,str[index],isIdChar(str[index]),
577  index>0?str[index]-1:0,
578  index>0?isIdChar(str[index-1]):-1);
579  if (index>=str.length() ||
580  !isIdChar(str[index]) ||
581  (index>0 && isIdChar(str[index-1]))) return false;
582  break;
584  DBG("EndOfWord: index=%zu pos=%zu idIdChar(%c)=%d prev.isIsChar(%c)=%d\n",
585  index,pos,str[index],isIdChar(str[index]),
586  index==0 ? 0 : str[index-1],
587  index==0 ? -1 : isIdChar(str[index-1]));
588  if (index<str.length() &&
589  (isIdChar(str[index]) || index==0 || !isIdChar(str[index-1]))) return false;
590  break;
592  DBG("BeginCapture(%zu)\n",index);
593  match.startCapture(index);
594  break;
596  DBG("EndCapture(%zu)\n",index);
597  match.endCapture(index);
598  break;
599  case PToken::Kind::Any:
600  if (index>=str.length()) return false;
601  index++;
602  break;
603  case PToken::Kind::Star:
604  return processSequence(Star);
606  return processSequence(Optional);
607  default:
608  return false;
609  }
610  tokenPos++;
611  }
612  }
613  match.setMatch(pos,index-pos);
614  return true;
615 }
616 
/** Converts a wildcard \a pattern into an equivalent regular expression string.
 *  The result is anchored with '^' and '$'; '*' becomes '.*', '?' becomes '.', the
 *  characters . + \ $ ^ ( ) are prefixed with a backslash, and a '^' directly after '['
 *  is copied as is. Conversion stops at the first NUL character of the pattern.
 */
static std::string wildcard2regex(const std::string &pattern)
{
  // characters that have a meaning in a regular expression but not in a wildcard
  auto needsEscape = [](char ch)
  {
    return ch=='.' || ch=='+' || ch=='\\' || ch=='$' || ch=='^' || ch=='(' || ch==')';
  };

  std::string result="^"; // match start of input
  const char *cur = pattern.c_str();
  while (*cur!='\0')
  {
    const char ch = *cur++;
    if (ch=='*') // '*' => '.*'
    {
      result+=".*";
    }
    else if (ch=='?') // '?' => '.'
    {
      result+='.';
    }
    else if (needsEscape(ch)) // escape
    {
      result+='\\';
      result+=ch;
    }
    else if (ch=='[' && *cur=='^') // don't escape ^ after [
    {
      result+="[^";
      cur++;
    }
    else // just copy
    {
      result+=ch;
    }
  }
  result+='$'; // match end of input
  return result;
}
660 
661 
662 Ex::Ex(const std::string &pattern, Mode mode)
663  : p(std::make_unique<Private>(mode==Mode::RegEx ? pattern : wildcard2regex(pattern)))
664 {
665  p->compile();
666 #if ENABLE_DEBUG
667  p->dump();
668  assert(!p->error);
669 #endif
670 }
671 
673 {
674 }
675 
676 bool Ex::match(const std::string &str,Match &match,size_t pos) const
677 {
678  bool found=false;
679  if (p->data.size()==0 || p->error) return found;
680  match.init(&str);
681 
682  PToken tok = p->data[0];
683  if (tok.kind()==PToken::Kind::BeginOfLine) // only test match at the given position
684  {
685  found = p->matchAt(0,str,match,pos,0);
686  }
687  else
688  {
689  if (tok.kind()==PToken::Kind::Character) // search for the start character
690  {
691  size_t index = str.find(tok.asciiValue(),pos);
692  if (index==std::string::npos)
693  {
694  DBG("Ex::match(str='%s',pos=%zu)=false (no start char '%c')\n",str.c_str(),pos,tok.asciiValue());
695  return false;
696  }
697  DBG("pos=%zu str='%s' char='%c' index=%zu\n",index,str.c_str(),tok.asciiValue(),index);
698  pos=index;
699  }
700  while (pos<str.length()) // search for a match starting at pos
701  {
702  found = p->matchAt(0,str,match,pos,0);
703  if (found) break;
704  pos++;
705  }
706  }
707  DBG("Ex::match(str='%s',pos=%zu)=%d\n",str.c_str(),pos,found);
708  return found;
709 }
710 
711 bool Ex::isValid() const
712 {
713  return !p->pattern.empty() && !p->error;
714 }
715 
716 //----------------------------------------------------------------------------------------
717 
718 bool search(const std::string &str,Match &match,const Ex &re,size_t pos)
719 {
720  return re.match(str,match,pos);
721 }
722 
723 bool search(const std::string &str,const Ex &re,size_t pos)
724 {
725  Match match;
726  return re.match(str,match,pos);
727 }
728 
729 bool match(const std::string &str,Match &match,const Ex &re)
730 {
731  return re.match(str,match,0) && match.position()==0 && match.length()==str.length();
732 }
733 
734 bool match(const std::string &str,const Ex &re)
735 {
736  Match match;
737  return re.match(str,match,0) && match.position()==0 && match.length()==str.length();
738 }
739 
740 std::string replace(const std::string &str,const Ex &re,const std::string &replacement)
741 {
742  std::string result;
743  Match match;
744  size_t p=0;
745  while (re.match(str,match,p))
746  {
747  size_t i=match.position();
748  size_t l=match.length();
749  if (i>p) result+=str.substr(p,i-p);
750  result+=replacement;
751  p=i+l;
752  }
753  if (p<str.length()) result+=str.substr(p);
754  return result;
755 }
756 
757 }
reg::PToken::PToken
PToken(uint16_t v)
Create a token for a byte of an UTF-8 character
Definition: regex.cpp:134
reg::Ex::Private::data
std::vector< PToken > data
The token stream representing the compiled regular expression.
Definition: regex.cpp:187
reg::isalpha
static bool isalpha(char c)
Definition: regex.cpp:38
reg::Ex::~Ex
~Ex()
Destroys the regular expression object.
Definition: regex.cpp:672
reg::replace
std::string replace(const std::string &str, const Ex &re, const std::string &replacement)
Searches the given input string str for parts that match regular expression re and replaces those parts with replacement.
Definition: regex.cpp:740
reg::PToken::Kind::Alpha
@ Alpha
reg::Ex::Private::pattern
std::string pattern
The pattern string as passed by the user
Definition: regex.cpp:190
reg::Ex::Private::Private
Private(const std::string &pat)
Creates the private part
Definition: regex.cpp:173
reg::PToken::PToken
PToken(Kind k)
Creates a token of the given kind k
Definition: regex.cpp:127
reg::match
bool match(const std::string &str, Match &match, const Ex &re)
Matches a given string str for a match against regular expression re.
Definition: regex.cpp:729
reg::PToken
Class representing a token in the compiled regular expression token stream.
Definition: regex.cpp:58
reg::Ex::isValid
bool isValid() const
Definition: regex.cpp:711
reg::PToken::kind
Kind kind() const
Returns the kind of the token
Definition: regex.cpp:144
reg::PToken::Kind::Any
@ Any
reg::PToken::isCharClass
bool isCharClass() const
Returns true iff this token is a positive or negative character class
Definition: regex.cpp:162
reg::PToken::isRange
bool isRange() const
Returns true iff this token represents a range of characters
Definition: regex.cpp:159
reg::PToken::asciiValue
char asciiValue() const
Returns the value for this token as a ASCII character
Definition: regex.cpp:156
reg::PToken::Kind::End
@ End
reg::PToken::m_rep
uint32_t m_rep
Definition: regex.cpp:165
DBG
#define DBG(fmt,...)
Definition: regex.cpp:27
reg::PToken::Kind::Character
@ Character
reg::PToken::Kind::EndOfWord
@ EndOfWord
reg
Namespace for the regular expression functions
Definition: regex.cpp:30
reg::isdigit
static bool isdigit(char c)
Definition: regex.cpp:43
reg::isspace
static bool isspace(char c)
Definition: regex.cpp:33
reg::PToken::Kind::EndOfLine
@ EndOfLine
reg::Ex::Mode
Mode
Matching algorithm
Definition: regex.h:52
reg::wildcard2regex
static std::string wildcard2regex(const std::string &pattern)
Definition: regex.cpp:617
reg::PToken::to
uint16_t to() const
Returns the 'to' part of the character range.
Definition: regex.cpp:150
reg::Ex::match
bool match(const std::string &str, Match &match, size_t pos=0) const
Check if a given string matches this regular expression.
Definition: regex.cpp:676
reg::Ex::Private::error
bool error
Flag indicating that compiling the expression failed (false after a successful compile)
Definition: regex.cpp:184
reg::isalnum
static bool isalnum(char c)
Definition: regex.cpp:48
reg::PToken::Kind::Digit
@ Digit
reg::PToken::Kind::CharClass
@ CharClass
reg::PToken::Kind::AlphaNum
@ AlphaNum
reg::PToken::Kind::Optional
@ Optional
reg::Match
Object representing the matching results.
Definition: regex.h:163
regex.h
reg::Ex::Private::compile
void compile()
Compiles a regular expression passed as a string into a stream of tokens that can be used for efficient searching.
Definition: regex.cpp:196
reg::PToken::Kind::BeginOfWord
@ BeginOfWord
reg::PToken::Kind::WhiteSpace
@ WhiteSpace
reg::PToken::kindStr
const char * kindStr() const
returns a string representation of the tokens kind (useful for debugging).
Definition: regex.cpp:92
reg::Ex::Private::matchAt
bool matchAt(size_t tokenPos, const std::string &str, Match &match, size_t pos, int level) const
Internal matching routine.
Definition: regex.cpp:438
reg::PToken::Kind::BeginOfLine
@ BeginOfLine
reg::PToken::Kind::Star
@ Star
reg::Ex
Class representing a regular expression.
Definition: regex.h:48
reg::PToken::Kind::BeginCapture
@ BeginCapture
reg::PToken::setValue
void setValue(uint16_t value)
Sets the value for a token
Definition: regex.cpp:141
reg::PToken::Kind::EndCapture
@ EndCapture
reg::PToken::value
uint16_t value() const
Returns the value for this token
Definition: regex.cpp:153
isIdChar
#define isIdChar(i)
Definition: markdown.cpp:174
reg::Ex::Ex
Ex(const std::string &pattern, Mode mode=Mode::RegEx)
Creates a regular expression object given the pattern as a string.
Definition: regex.cpp:662
reg::PToken::Kind
Kind
The kind of token.
Definition: regex.cpp:70
reg::PToken::PToken
PToken(char c)
Create a token for an ASCII character
Definition: regex.cpp:130
reg::PToken::PToken
PToken(uint16_t from, uint16_t to)
Create a token representing a range from one character from to another character to
Definition: regex.cpp:138
reg::search
bool search(const std::string &str, Match &match, const Ex &re, size_t pos)
Search in a given string str starting at position pos for a match against regular expression re.
Definition: regex.cpp:718
reg::Ex::Private
Private members of a regular expression
Definition: regex.cpp:169
reg::PToken::PToken
PToken()
Creates a token of kind 'End'
Definition: regex.cpp:124
reg::PToken::from
uint16_t from() const
Returns the 'from' part of the character range.
Definition: regex.cpp:147
reg::PToken::Kind::NegCharClass
@ NegCharClass
reg::Ex::p
std::unique_ptr< Private > p
Definition: regex.h:124