Doxygen
utf8.h
浏览该文件的文档.
1 /******************************************************************************
2  *
3  * Copyright (C) 1997-2021 by Dimitri van Heesch.
4  *
5  * Permission to use, copy, modify, and distribute this software and its
6  * documentation under the terms of the GNU General Public License is hereby
7  * granted. No representations are made about the suitability of this software
8  * for any purpose. It is provided "as is" without express or implied warranty.
9  * See the GNU General Public License for more details.
10  *
11  * Documents produced by Doxygen are derivative works derived from the
12  * input used in their production; they are not affected by this license.
13  *
14  */
15 
16 #ifndef UTF8_H
17 #define UTF8_H
18 
19 #include <cstdint>
20 #include <string>
21 
22 class TextStream;
23 
24 /** @file
25  * @brief Various UTF8 related helper functions.
26  *
27  * See https://en.wikipedia.org/wiki/UTF-8 for details on UTF8 encoding.
28  */
29 
30 
31 /** Converts the input string into a lower case version, also taking into account
32  * non-ASCII characters that have a lower case variant.
33  */
34 std::string convertUTF8ToLower(const std::string &input);
35 
36 /** Converts the input string into an upper case version, also taking into account
37  * non-ASCII characters that have an upper case variant.
38  */
39 std::string convertUTF8ToUpper(const std::string &input);
40 
41 /** Returns the UTF8 character found at byte position pos in the input string.
42  * The resulting string can be a multi byte sequence.
43  */
44 std::string getUTF8CharAt(const std::string &input,size_t pos);
45 
46 /** Returns the 32bit Unicode value matching character at byte position pos in
47  * the UTF8 encoded input.
48  */
49 uint32_t getUnicodeForUTF8CharAt(const std::string &input,size_t pos);
50 
51 /** Returns the number of bytes making up a single UTF8 character given the first byte
52  * in the sequence.
53  */
54 uint8_t getUTF8CharNumBytes(char firstByte);
55 
56 /** Writes the UTF8 character pointed to by s to stream t and returns a pointer
57  * to the next character.
58  */
59 const char *writeUTF8Char(TextStream &t,const char *s);
60 
61 /** Returns true iff the last character in input is a multibyte character. */
62 bool lastUTF8CharIsMultibyte(const std::string &input);
63 
64 /** Returns true iff the input string at byte position pos holds an upper case character. */
65 bool isUTF8CharUpperCase(const std::string &input,size_t pos);
66 
67 /** Check if the first character pointed at by input is a non-breakable whitespace character.
68  * Returns the byte size of the character if there is a match, or 0 if not.
69  */
70 int isUTF8NonBreakableSpace(const char *input);
71 
72 #endif
lastUTF8CharIsMultibyte
bool lastUTF8CharIsMultibyte(const std::string &input)
Returns true iff the last character in input is a multibyte character.
Definition: utf8.cpp:212
convertUTF8ToUpper
std::string convertUTF8ToUpper(const std::string &input)
Converts the input string into an upper case version, also taking into account non-ASCII characters that have an upper case variant.
Definition: utf8.cpp:192
TextStream
Text streaming class that buffers data.
Definition: textstream.h:33
isUTF8CharUpperCase
bool isUTF8CharUpperCase(const std::string &input, size_t pos)
Returns true iff the input string at byte position pos holds an upper case character.
Definition: utf8.cpp:218
getUTF8CharNumBytes
uint8_t getUTF8CharNumBytes(char firstByte)
Returns the number of bytes making up a single UTF8 character given the first byte in the sequence.
Definition: utf8.cpp:23
isUTF8NonBreakableSpace
int isUTF8NonBreakableSpace(const char *input)
Check if the first character pointed at by input is a non-breakable whitespace character.
Definition: utf8.cpp:228
getUTF8CharAt
std::string getUTF8CharAt(const std::string &input, size_t pos)
Returns the UTF8 character found at byte position pos in the input string.
Definition: utf8.cpp:127
getUnicodeForUTF8CharAt
uint32_t getUnicodeForUTF8CharAt(const std::string &input, size_t pos)
Returns the 32bit Unicode value matching character at byte position pos in the UTF8 encoded input.
Definition: utf8.cpp:135
convertUTF8ToLower
std::string convertUTF8ToLower(const std::string &input)
Converts the input string into a lower case version, also taking into account non-ASCII characters that have a lower case variant.
Definition: utf8.cpp:187
writeUTF8Char
const char * writeUTF8Char(TextStream &t, const char *s)
Writes the UTF8 character pointed to by s to stream t and returns a pointer to the next character.
Definition: utf8.cpp:197