Doxygen
regex.h
浏览该文件的文档.
1 /******************************************************************************
2  *
3  * Copyright (C) 1997-2021 by Dimitri van Heesch.
4  *
5  * Permission to use, copy, modify, and distribute this software and its
6  * documentation under the terms of the GNU General Public License is hereby
7  * granted. No representations are made about the suitability of this software
8  * for any purpose. It is provided "as is" without express or implied warranty.
9  * See the GNU General Public License for more details.
10  *
11  * Documents produced by Doxygen are derivative works derived from the
12  * input used in their production; they are not affected by this license.
13  *
14  */
15 
16 #ifndef FREGEX_H
17 #define FREGEX_H
18 
19 #include <memory>
20 #include <string>
21 #include <vector>
22 #include <iterator>
23 
24 /** Namespace for the regular expression functions */
25 namespace reg
26 {
27 
28 class Match;
29 
30 /** Class representing a regular expression.
31  *
32  * It has a similar API as `std::regex`,
33  * but is much faster (and also somewhat more limited).
34  */
35 class Ex
36 {
37  public:
38  /** Matching algorithm */
39  enum class Mode
40  {
41  RegEx, /**< full regular expression. */
42  Wildcard /**< simple globbing pattern. */
43  };
44  /** Creates a regular expression object given the pattern as a string.
45  * Two modes of matching are supported: RegEx and Wildcard
46  *
47  * The following special characters are supported in Mode::RegEx mode.
48  * - `c` matches character `c`
49  * - `.` matches any character
50  * - `^` matches the start of the input
51  * - `$` matches the end of the input
52  * - `<` matches the start of a word
53  * - `>` matches the end of a word
54  * - `[]` matches a set of characters
55  * - `x*` matches a sequence of zero or more `x`'s
56  * - `x+` matches a sequence of one or more `x`'s
57  * - `x?` matches an optional `x`
58  * - `(` matches the start of a capture range
59  * - `)` matches the ends a capture range
60  * - `\c` to escape a special character, such as `+`, `[`, `*`, `(`, etc.
61  * - `\t` matches a tab character
62  * - `\n` matches a newline character
63  * - `\r` matches a return character
64  * - `\s` matches any whitespace as defined by `std::isspace()`
65  * - `\d` matches any digit as defined by `std::digit()`
66  * - `\a` matches any alphabetical characters, same as `[a-z_A-Z\x80-\xFF]`
67  * - `\w` matches any alpha numercial character, same as `[a-z_A-Z0-9\x80-\xFF]`
68  * - `\xHH` matches a hexadecimal character, e.g. `\xA0` matches character code 160.
69  *
70  * A character range can be used to match a character that falls inside a range
71  * (or set of ranges).
72  * Within the opening `[` and closing `]` brackets of a character ranges the following
73  * is supported:
74  * - `^` if at the start of the range, a character matches if it is \e not in the range,
75  * e.g. `[^\d]` matches any character not a digit
76  * - `-` when placed between 2 characters it defines a range from the first character to the second.
77  * any character that falls in the range will match, e.g. [0-9] matches the digit from 0 to 9.
78  * - `\s`, `\d`, `\a`, and `\w` as explained above.
79  *
80  * @note that special characters `.`, `*`, `?`, `$`, `+`, `[` do not have a special
81  * meaning in a character range. `^` only has a special meaning as the first character.
82  *
83  * @note that capture ranges cannot be nested, and `*`, `+`, and `?` do not work on
84  * capture ranges. e.g. `(abd)?` is not valid. If multiple capture ranges are
85  * specified then some character has to be in between them,
86  * e.g. this does not work `(.*)(a.*)`, but this does `(.*)a(.*)`.
87  *
88  * In Wildcard mode `*` is used to match any sequence of zero or more characters.
89  * The character `?` can be used to match an optional character. Character ranges are
90  * also supported, but other characters like `$` and `+` are just treated as
91  * literal characters.
92  *
93  */
94  Ex(const std::string &pattern, Mode mode=Mode::RegEx);
95 
96  /** Destroys the regular expression object. Frees resources. */
97  ~Ex();
98 
99  /** Check if a given string matches this regular expression.
100  * @param str The input string to match against.
101  * @param match The match object to hold the matching results.
102  * @param pos The position in the string at which to start the match.
103  * @returns true iff a match is found. Details are stored in the match object.
104  */
105  bool match(const std::string &str,Match &match,size_t pos=0) const;
106  bool isValid() const;
107  private:
108  Ex(const Ex &) = delete;
109  Ex &operator=(const Ex &e) = delete;
110 
111  class Private;
112  std::unique_ptr<Private> p;
113 };
114 
/** Object representing the match results of a capture range.
 *
 *  A SubMatch does not own the string it refers to; it only stores a pointer
 *  to it together with the start position and length of the matched part.
 *  Until a position/length has been set both are `std::string::npos`.
 */
class SubMatch
{
  public:
    /** Creates a match for a single capture range given a non-owning pointer to the string.
     *  @param str The string the range refers to; may be null.
     */
    SubMatch(const std::string *str) : m_str(str) {}

    /** Returns the position in the string at which the match starts,
     *  or `std::string::npos` if no position has been set.
     */
    size_t position() const { return m_pos; }

    /** Returns the length of the matching part,
     *  or `std::string::npos` if no length has been set.
     */
    size_t length() const { return m_len; }

    /** Returns the matching part as a string.
     *  An empty string is returned when there is no string attached, or when
     *  the start position is unset or lies beyond the end of the string.
     */
    std::string str() const
    {
      // std::string::substr() throws std::out_of_range for a start position
      // past the end, which is what an unset (npos) position would trigger.
      if (m_str==nullptr || m_pos>m_str->length()) return std::string();
      return m_str->substr(m_pos,m_len);
    }

  private:
    friend class Match;

    /** Sets the start position of the range. */
    void setStart(size_t pos) { m_pos=pos; }

    /** Sets the end position of the range; the length is derived from the
     *  previously set start position.
     */
    void setEnd(size_t pos) { m_len=pos-m_pos; }

    /** Sets both the start position and the length of the range. */
    void setMatch(size_t pos,size_t len) { m_pos=pos; m_len=len; }

    size_t m_pos = std::string::npos;    //!< start of the range, npos if unset
    size_t m_len = std::string::npos;    //!< length of the range, npos if unset
    const std::string *m_str = nullptr;  //!< non-owning pointer to the input string
};
140 
141 /** Object representing the matching results. It consists of an array of
142  * SubMatch objects. The first entry of the array represents the whole match, any
143  * next elements represent each of the capture ranges.
144  *
145  * For example string `@42` and expression `@(\\d+)` will have two
146  * Submatches, match[0] will point to the input string as a whole, and
147  * match[1] will point to the number 42 only.
148  *
149  */
150 class Match
151 {
152  public:
153  /** Creates an empty match object */
154  Match() {}
155 
156  /** Returns the position of the match or std::string::npos if no position is set. */
157  size_t position() const { return m_subMatches[0].position(); }
158 
159  /** Returns the position of the match or std::string::npos if no length is set. */
160  size_t length() const { return m_subMatches[0].length(); }
161 
162  /** Return a string representing the matching part. */
163  std::string str() const { return m_subMatches[0].str(); }
164 
165  /** Return the part of the string before the match */
166  SubMatch prefix() const { SubMatch m(m_str); m.setMatch(0,position()); return m; }
167 
168  /** Return the part of the string after the match */
169  SubMatch suffix() const
170  {
171  SubMatch m(m_str);
172  if (m_str)
173  {
174  size_t e = position()+length();
175  m.setMatch(e,m_str->length()-e);
176  }
177  return m;
178  }
179 
180  /** Returns the number of sub matches available in this match. */
181  size_t size() const { return m_subMatches.size(); }
182 
183  /** Returns the n-th SubMatch object. Note that there is always 1 SubMatch object
184  * representing the whole match.
185  */
186  const SubMatch &operator[](size_t index) const { return m_subMatches[index]; }
187 
188  private:
189  friend class Ex;
190  void init(const std::string *str)
191  {
192  m_subMatches.clear();
193  m_subMatches.emplace_back(str);
195  }
196  void startCapture(size_t index)
197  {
198  if (!m_insideCapture) // when backtracking we can re-entry the capture multiple times
199  // only update the index, example `\s*(x)`
200  {
201  m_captureIndex = m_subMatches.size();
202  m_subMatches.emplace_back(m_str);
204  }
205  m_subMatches.back().setStart(index);
206  }
207  void endCapture(size_t index)
208  {
209  if (index>m_subMatches.back().position())
210  {
211  m_captureIndex=0;
212  m_subMatches.back().setEnd(index);
213  m_insideCapture = false;
214  }
215  }
216  void setMatch(size_t pos,size_t len)
217  {
218  m_subMatches[m_captureIndex].setMatch(pos,len);
219  }
220 
221  std::vector<SubMatch> m_subMatches;
222  size_t m_captureIndex=0;
223  const std::string *m_str = nullptr;
224  bool m_insideCapture=false;
225 };
226 
227 /** Iterator class to iterator through matches.
228  */
229 class Iterator
230 {
231  public:
232  using value_type = Match;
233  using difference_type = std::ptrdiff_t;
234  using pointer = value_type*;
236  using iterator_category = std::forward_iterator_tag;
237 
238  /** Creates an end-of-sequence iterator */
239  Iterator() {}
240 
241  /** Creates an iterator for input string \a str, using regular expression \a re to search.
242  * @note the string and regular expression objects should remain valid while iterating.
243  */
244  Iterator(const std::string &str, const Ex &re, size_t pos=0)
245  : m_str(&str), m_re(&re), m_pos(pos) { findNext(); }
246 
247  // Iterator holds pointers, so prevent temporaries to be passed as string or
248  // regular expression
249  Iterator(std::string &&str, const Ex &re) = delete;
250  Iterator(const std::string &str, Ex &&re) = delete;
251  Iterator(std::string &&str, Ex &&re) = delete;
252 
253  /** Returns true if the iterators point to the same match (or both are end-of-sequence iterators) */
254  bool operator==(const Iterator &rhs) const { return rhs.m_pos==m_pos; }
255 
256  /** Returns true if the iterators are not pointing to the same match */
257  bool operator!=(const Iterator &rhs) const { return rhs.m_pos!=m_pos; }
258 
259  /** Returns a reference to the current match */
260  const value_type &operator*() const { return m_match; }
261 
262  /** Returns a pointer to the current match */
263  const value_type *operator->() const { return &m_match; }
264 
265  /** Advances the iterator to the next match. */
266  Iterator &operator++() { findNext(); return *this; }
267 
268  private:
269  void findNext()
270  {
271  if (!m_re || !m_str) { m_pos=std::string::npos; return; } // end marker
272  if (m_re->match(*m_str,m_match,m_pos))
273  {
274  m_pos=m_match.position()+m_match.length(); // update m_pos to point beyond last match
275  }
276  else // no more matches, make the iterator point to the 'end-of-sequence'
277  {
278  m_pos=std::string::npos;
279  }
280  }
281  const std::string *m_str = nullptr;
282  const Ex *m_re = nullptr;
283  size_t m_pos = std::string::npos;
284  Match m_match;
285 };
286 
287 /** Search in a given string \a str starting at position \a pos for a match against regular expression \a re.
288  * Returns true iff a match was found.
289  * Details of what part of the string has matched is returned via the \a match object.
290  *
291  * An example to show how to match all identifiers in a string.
292  * @code
293  * static reg::Ex re(R"(\a\w*)");
294  * std::string = u8"void(Func是<B_C::Códe42>(42));";
295  * while (reg::search(str,match,re,pos))
296  * {
297  * std::cout << match.str() << std::endl;
298  * pos=match.position()+match.length();
299  * }
300  * @endcode
301  * produces:
302  * @code
303  * void
304  * Func是
305  * B_C
306  * Códe42
307  * @endcode
308  *
309  * @see Ex::Ex() for details on the regular expression patterns.
310  */
311 bool search(const std::string &str,Match &match,const Ex &re,size_t pos=0);
312 
313 /** Search in a given string \a str starting at position \a pos for a match against regular expression \a re.
314  * Returns true iff a match was found.
315  */
316 bool search(const std::string &str,const Ex &re,size_t pos=0);
317 
318 /** Matches a given string \a str for a match against regular expression \a re.
319  * Returns true iff a match was found for the whole string.
320  * Any capture groups are returned via the \a match object.
321  */
322 bool match(const std::string &str,Match &match,const Ex &re);
323 
324 /** Matches a given string \a str for a match against regular expression \a re.
325  * Returns true iff a match was found for the whole string.
326  */
327 bool match(const std::string &str,const Ex &re);
328 
329 /** Searching in a given input string \a for parts that match regular expression \a re and
330  * replaces those parts by string \a replacement.
331  */
332 std::string replace(const std::string &str,const Ex &re,const std::string &replacement);
333 
334 } // namespace
335 
336 #endif
reg::SubMatch::setMatch
void setMatch(size_t pos, size_t len)
Definition: regex.h:148
reg::Match::operator[]
const SubMatch & operator[](size_t index) const
Returns the n-th SubMatch object.
Definition: regex.h:199
reg::SubMatch::str
std::string str() const
Returns the matching part as a string
Definition: regex.h:142
reg::Match::str
std::string str() const
Return a string representing the matching part.
Definition: regex.h:176
reg::Ex::~Ex
~Ex()
Destroys the regular expression object.
Definition: regex.cpp:672
reg::Iterator::m_re
const Ex * m_re
Definition: regex.h:295
reg::replace
std::string replace(const std::string &str, const Ex &re, const std::string &replacement)
Searching in a given input string for parts that match regular expression re and replaces those parts...
Definition: regex.cpp:740
reg::Iterator::operator!=
bool operator!=(const Iterator &rhs) const
Returns true if the iterators are not pointing to the same match
Definition: regex.h:270
reg::Match::prefix
SubMatch prefix() const
Return the part of the string before the match
Definition: regex.h:179
reg::SubMatch::setStart
void setStart(size_t pos)
Definition: regex.h:146
reg::match
bool match(const std::string &str, Match &match, const Ex &re)
Matches a given string str for a match against regular expression re.
Definition: regex.cpp:729
reg::SubMatch::length
size_t length() const
Returns the length of the matching part.
Definition: regex.h:139
reg::Ex::isValid
bool isValid() const
Definition: regex.cpp:711
reg::Match::endCapture
void endCapture(size_t index)
Definition: regex.h:220
reg::Match::length
size_t length() const
Returns the length of the match or std::string::npos if no length is set.
Definition: regex.h:173
reg::Iterator::m_match
Match m_match
Definition: regex.h:297
reg::Ex::Mode::Wildcard
@ Wildcard
simple globbing pattern.
reg::Match::setMatch
void setMatch(size_t pos, size_t len)
Definition: regex.h:229
reg::Iterator::operator==
bool operator==(const Iterator &rhs) const
Returns true if the iterators point to the same match (or both are end-of-sequence iterators)
Definition: regex.h:267
reg::Match::Match
Match()
Creates an empty match object
Definition: regex.h:167
reg::Match::init
void init(const std::string *str)
Definition: regex.h:203
reg::SubMatch::m_str
const std::string * m_str
Definition: regex.h:151
reg::Iterator
Iterator class to iterate through matches.
Definition: regex.h:242
reg
Namespace for the regular expression functions
Definition: regex.cpp:30
reg::Match::startCapture
void startCapture(size_t index)
Definition: regex.h:209
reg::SubMatch::position
size_t position() const
Returns the position in the string at which the match starts.
Definition: regex.h:136
reg::SubMatch
Object representing the match results of a capture range.
Definition: regex.h:129
reg::Ex::Mode
Mode
Matching algorithm
Definition: regex.h:52
reg::Ex::match
bool match(const std::string &str, Match &match, size_t pos=0) const
Check if a given string matches this regular expression.
Definition: regex.cpp:676
reg::Iterator::operator++
Iterator & operator++()
Advances the iterator to the next match.
Definition: regex.h:279
reg::Iterator::Iterator
Iterator()
Creates an end-of-sequence iterator
Definition: regex.h:252
reg::SubMatch::setEnd
void setEnd(size_t pos)
Definition: regex.h:147
reg::SubMatch::m_pos
size_t m_pos
Definition: regex.h:149
reg::Match::m_insideCapture
bool m_insideCapture
Definition: regex.h:237
reg::Match::position
size_t position() const
Returns the position of the match or std::string::npos if no position is set.
Definition: regex.h:170
reg::Match
Object representing the matching results.
Definition: regex.h:163
reg::Iterator::difference_type
std::ptrdiff_t difference_type
Definition: regex.h:246
reg::Iterator::pointer
value_type * pointer
Definition: regex.h:247
reg::Iterator::m_str
const std::string * m_str
Definition: regex.h:294
reg::Iterator::reference
value_type & reference
Definition: regex.h:248
reg::Iterator::iterator_category
std::forward_iterator_tag iterator_category
Definition: regex.h:249
reg::SubMatch::SubMatch
SubMatch(const std::string *str)
Creates a match for a single capture range given a non-owning pointer to the string.
Definition: regex.h:133
reg::Match::m_subMatches
std::vector< SubMatch > m_subMatches
Definition: regex.h:234
reg::Iterator::operator*
const value_type & operator*() const
Returns a reference to the current match
Definition: regex.h:273
reg::Iterator::m_pos
size_t m_pos
Definition: regex.h:296
reg::Iterator::findNext
void findNext()
Definition: regex.h:282
reg::Match::size
size_t size() const
Returns the number of sub matches available in this match.
Definition: regex.h:194
reg::Ex
Class representing a regular expression.
Definition: regex.h:48
reg::Match::suffix
SubMatch suffix() const
Return the part of the string after the match
Definition: regex.h:182
reg::Ex::Mode::RegEx
@ RegEx
full regular expression.
reg::SubMatch::m_len
size_t m_len
Definition: regex.h:150
reg::Iterator::value_type
Match value_type
Definition: regex.h:245
reg::Match::m_captureIndex
size_t m_captureIndex
Definition: regex.h:235
reg::Ex::Ex
Ex(const std::string &pattern, Mode mode=Mode::RegEx)
Creates a regular expression object given the pattern as a string.
Definition: regex.cpp:662
reg::search
bool search(const std::string &str, Match &match, const Ex &re, size_t pos)
Search in a given string str starting at position pos for a match against regular expression re.
Definition: regex.cpp:718
reg::Iterator::operator->
const value_type * operator->() const
Returns a pointer to the current match
Definition: regex.h:276
reg::Ex::Private
Private members of a regular expression
Definition: regex.cpp:169
reg::Match::m_str
const std::string * m_str
Definition: regex.h:236
reg::Ex::operator=
Ex & operator=(const Ex &e)=delete
reg::Ex::p
std::unique_ptr< Private > p
Definition: regex.h:124