Doxygen
utf8.cpp
浏览该文件的文档.
1 /******************************************************************************
2  *
3  * Copyright (C) 1997-2021 by Dimitri van Heesch.
4  *
5  * Permission to use, copy, modify, and distribute this software and its
6  * documentation under the terms of the GNU General Public License is hereby
7  * granted. No representations are made about the suitability of this software
8  * for any purpose. It is provided "as is" without express or implied warranty.
9  * See the GNU General Public License for more details.
10  *
11  * Documents produced by Doxygen are derivative works derived from the
12  * input used in their production; they are not affected by this license.
13  *
14  */
15 
16 #include <cstdint>
17 #include <sstream>
18 
19 #include "utf8.h"
20 #include "caseconvert.h"
21 #include "textstream.h"
22 
//! Returns the number of bytes of the UTF-8 sequence that starts with lead byte \a c.
//!
//! The result is derived from the high bits of \a c only; up to 6-byte
//! sequences are recognised. Any byte that does not match a multi-byte lead
//! pattern (ASCII, a 10xx.xxxx continuation byte, 0xFE or 0xFF) yields 1.
uint8_t getUTF8CharNumBytes(char c)
{
  const unsigned char lead = static_cast<unsigned char>(c);
  if (lead<0x80u)          return 1; // 0xxx.xxxx: single byte (ASCII)
  if ((lead&0xE0u)==0xC0u) return 2; // 110x.xxxx: 2 byte character
  if ((lead&0xF0u)==0xE0u) return 3; // 1110.xxxx: 3 byte character
  if ((lead&0xF8u)==0xF0u) return 4; // 1111.0xxx: 4 byte character
  if ((lead&0xFCu)==0xF8u) return 5; // 1111.10xx: 5 byte character
  if ((lead&0xFEu)==0xFCu) return 6; // 1111.110x: 6 byte character
  return 1;                          // no lead pattern matched: count as one byte
}
52 
//! Assembles a Unicode code point from the \a numBytes bytes starting at \a data.
//!
//! For a single byte the byte value itself is returned. For longer sequences
//! the length marker bits are stripped from the first byte and the low 6 bits
//! of each following byte are appended. The bytes are not validated.
static inline uint32_t decode_utf8( const char* data , int numBytes ) noexcept
{
  const unsigned char *bytes = reinterpret_cast<const unsigned char*>(data);
  uint32_t codePoint = bytes[0];
  if (numBytes<=1)
  {
    return codePoint;
  }
  // keep only the payload bits of the lead byte (e.g. 0x1F for a 2 byte sequence)
  codePoint &= 0x7F >> numBytes;
  int idx = 1;
  while (idx<numBytes)
  {
    const uint32_t payload = bytes[idx] & 0x3F; // low 6 bits of a trailing byte
    codePoint = (codePoint<<6) | payload;
    ++idx;
  }
  return codePoint;
}
68 
69 static inline uint32_t convertUTF8CharToUnicode(const char *s,size_t bytesLeft,int &len)
70 {
71  if (s==0 || bytesLeft==0)
72  {
73  len=0;
74  return 0;
75  }
76  unsigned char uc = static_cast<unsigned char>(*s);
77  if (uc<128) // ASCII case
78  {
79  len=1;
80  return uc;
81  }
82  switch (bytesLeft)
83  {
84  default:
85  if ((uc&0xFEu)==0xFCu)// 1111110X six bytes
86  {
87  len=6;
88  return decode_utf8(s,len);
89  }
90  // fall through
91  case 5:
92  if ((uc&0xFCu)==0xF8u) // 111110XX five bytes
93  {
94  len=5;
95  return decode_utf8(s,len);
96  }
97  // fall through
98  case 4:
99  if ((uc&0xF8u)==0xF0u) // 11110XXX four bytes
100  {
101  len=4;
102  return decode_utf8(s,len);
103  }
104  // fall through
105  case 3:
106  if ((uc&0xF0u)==0xE0u) // 1110XXXX three bytes
107  {
108  len=3;
109  return decode_utf8(s,len);
110  }
111  // fall through
112  case 2:
113  if ((uc&0xE0u)==0xC0u) // 110XXXXX two bytes
114  {
115  len=2;
116  return decode_utf8(s,len);
117  }
118  // fall through
119  case 1:
120  {
121  len=1;
122  return uc;
123  }
124  }
125 }
126 
127 std::string getUTF8CharAt(const std::string &input,size_t pos)
128 {
129  if (input.length()<=pos) return std::string();
130  int numBytes=getUTF8CharNumBytes(input[pos]);
131  if (input.length()<pos+numBytes) return std::string();
132  return input.substr(pos,pos+numBytes);
133 }
134 
135 uint32_t getUnicodeForUTF8CharAt(const std::string &input,size_t pos)
136 {
137  std::string charS = getUTF8CharAt(input,pos);
138  int len;
139  return convertUTF8CharToUnicode(charS.c_str(),charS.length(),len);
140 }
141 
//! Maps 'A'..'Z' to 'a'..'z'; any other \a code is returned converted to char unchanged.
static inline char asciiToLower(uint32_t code)
{
  if (code>='A' && code<='Z')
  {
    return static_cast<char>(code+'a'-'A');
  }
  return static_cast<char>(code);
}
146 
//! Maps 'a'..'z' to 'A'..'Z'; any other \a code is returned converted to char unchanged.
static inline char asciiToUpper(uint32_t code)
{
  if (code>='a' && code<='z')
  {
    return static_cast<char>(code+'A'-'a');
  }
  return static_cast<char>(code);
}
151 
152 static inline std::string caseConvert(const std::string &input,
153  char (*asciiConversionFunc)(uint32_t code),
154  const char *(*conversionFunc)(uint32_t code))
155 {
156  uint32_t code;
157  std::string result;
158  result.reserve(input.length()); // assume all ASCII characters
159  int len;
160  size_t bytesLeft = input.length();
161  const char *p = input.c_str();
162  while ((code=convertUTF8CharToUnicode(p,bytesLeft,len)))
163  {
164  if (code<128) // ASCII case
165  {
166  char c = asciiConversionFunc(code);
167  result+=c;
168  }
169  else // generic case
170  {
171  const char *conv = conversionFunc(code);
172  if (conv==nullptr) // no difference between lower and upper case
173  {
174  result.append(p,len);
175  }
176  else // replace the input character with the conversion result
177  {
178  result.append(conv);
179  }
180  }
181  p+=len;
182  bytesLeft-=len;
183  }
184  return result;
185 }
186 
187 std::string convertUTF8ToLower(const std::string &input)
188 {
190 }
191 
192 std::string convertUTF8ToUpper(const std::string &input)
193 {
195 }
196 
197 const char *writeUTF8Char(TextStream &t,const char *s)
198 {
199  if (s==0) return 0;
200  uint8_t len = getUTF8CharNumBytes(*s);
201  for (uint8_t i=0;i<len;i++)
202  {
203  if (s[i]==0) // detect premature end of string (due to invalid UTF8 char)
204  {
205  len=i;
206  }
207  }
208  t.write(s,len);
209  return s+len;
210 }
211 
//! Returns true iff the last byte of \a input belongs to a multibyte UTF-8
//! character, i.e. its top two bits are 10. An empty string yields false.
bool lastUTF8CharIsMultibyte(const std::string &input)
{
  if (input.empty())
  {
    return false;
  }
  const unsigned char lastByte = static_cast<unsigned char>(input.back());
  // bit 8 set and bit 7 clear marks a byte inside a multibyte character
  return (lastByte&0xC0)==0x80;
}
217 
218 bool isUTF8CharUpperCase(const std::string &input,size_t pos)
219 {
220  if (input.length()<=pos) return false;
221  int len;
222  // turn the UTF8 character at position pos into a unicode value
223  uint32_t code = convertUTF8CharToUnicode(input.c_str()+pos,input.length()-pos,len);
224  // check if the character can be converted to lower case, if so it was an upper case character
225  return convertUnicodeToLower(code)!=nullptr;
226 }
227 
//! Checks whether \a input starts with the two bytes 0xC2 0xA0, the UTF-8
//! encoding of a non-breakable space.
//!
//! The second byte is only inspected when the first one matches.
//! \return 2 (the length of the sequence in bytes) on a match, 0 otherwise.
int isUTF8NonBreakableSpace(const char *input)
{
  const unsigned char *bytes = reinterpret_cast<const unsigned char*>(input);
  if (bytes[0]!=0xC2) return 0;
  if (bytes[1]!=0xA0) return 0;
  return 2;
}
233 
caseconvert.h
convertUnicodeToUpper
const char * convertUnicodeToUpper(uint32_t code)
Definition: caseconvert.h:12
convertUTF8ToLower
std::string convertUTF8ToLower(const std::string &input)
Converts the input string into a lower case version, also taking into account non-ASCII characters th...
Definition: utf8.cpp:187
isUTF8CharUpperCase
bool isUTF8CharUpperCase(const std::string &input, size_t pos)
Returns true iff the input string at byte position pos holds an upper case character.
Definition: utf8.cpp:218
textstream.h
asciiToUpper
static char asciiToUpper(uint32_t code)
Definition: utf8.cpp:147
TextStream
Text streaming class that buffers data.
Definition: textstream.h:33
caseConvert
static std::string caseConvert(const std::string &input, char(*asciiConversionFunc)(uint32_t code), const char *(*conversionFunc)(uint32_t code))
Definition: utf8.cpp:152
decode_utf8
static uint32_t decode_utf8(const char *data, int numBytes) noexcept
Decodes a given input of utf8 data to a unicode code point given the number of bytes it's made of
Definition: utf8.cpp:55
lastUTF8CharIsMultibyte
bool lastUTF8CharIsMultibyte(const std::string &input)
Returns true iff the last character in input is a multibyte character.
Definition: utf8.cpp:212
getUTF8CharAt
std::string getUTF8CharAt(const std::string &input, size_t pos)
Returns the UTF8 character found at byte position pos in the input string.
Definition: utf8.cpp:127
convertUTF8CharToUnicode
static uint32_t convertUTF8CharToUnicode(const char *s, size_t bytesLeft, int &len)
Definition: utf8.cpp:69
utf8.h
Various UTF8 related helper functions.
writeUTF8Char
const char * writeUTF8Char(TextStream &t, const char *s)
Writes the UTF8 character pointed to by s to stream t and returns a pointer to the next character.
Definition: utf8.cpp:197
getUTF8CharNumBytes
uint8_t getUTF8CharNumBytes(char c)
Returns the number of bytes making up a single UTF8 character given the first byte in the sequence.
Definition: utf8.cpp:23
isUTF8NonBreakableSpace
int isUTF8NonBreakableSpace(const char *input)
Check if the first character pointed at by input is a non-breakable whitespace character.
Definition: utf8.cpp:228
convertUnicodeToLower
const char * convertUnicodeToLower(uint32_t code)
Definition: caseconvert.h:1505
convertUTF8ToUpper
std::string convertUTF8ToUpper(const std::string &input)
Converts the input string into an upper case version, also taking into account non-ASCII characters th...
Definition: utf8.cpp:192
getUnicodeForUTF8CharAt
uint32_t getUnicodeForUTF8CharAt(const std::string &input, size_t pos)
Returns the 32bit Unicode value matching character at byte position pos in the UTF8 encoded input.
Definition: utf8.cpp:135
asciiToLower
static char asciiToLower(uint32_t code)
Definition: utf8.cpp:142
TextStream::write
void write(const char *buf, size_t len)
Adds an array of characters to the stream
Definition: textstream.h:180