Unicode.h 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. //
  2. // Unicode.h
  3. //
  4. // Library: Foundation
  5. // Package: Text
  6. // Module: Unicode
  7. //
  8. // Definition of the Unicode class.
  9. //
  10. // Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
  11. // and Contributors.
  12. //
  13. // SPDX-License-Identifier: BSL-1.0
  14. //
  15. #ifndef Foundation_Unicode_INCLUDED
  16. #define Foundation_Unicode_INCLUDED
  17. #include "Poco/Foundation.h"
  18. namespace Poco {
  19. class Foundation_API Unicode
  20. /// This class contains enumerations and static
  21. /// utility functions for dealing with Unicode characters
  22. /// and their properties.
  23. ///
  24. /// For more information on Unicode, see <http://www.unicode.org>.
  25. ///
  26. /// The implementation is based on the Unicode support
  27. /// functions in PCRE.
  28. {
  29. public:
  30. // Implementation note: the following definitions must be kept
  31. // in sync with those from ucp.h (PCRE).
  32. enum CharacterCategory
  33. /// Unicode character categories.
  34. {
  35. UCP_OTHER,
  36. UCP_LETTER,
  37. UCP_MARK,
  38. UCP_NUMBER,
  39. UCP_PUNCTUATION,
  40. UCP_SYMBOL,
  41. UCP_SEPARATOR
  42. };
  43. enum CharacterType
  44. /// Unicode character types.
  45. {
  46. UCP_CONTROL,
  47. UCP_FORMAT,
  48. UCP_UNASSIGNED,
  49. UCP_PRIVATE_USE,
  50. UCP_SURROGATE,
  51. UCP_LOWER_CASE_LETTER,
  52. UCP_MODIFIER_LETTER,
  53. UCP_OTHER_LETTER,
  54. UCP_TITLE_CASE_LETTER,
  55. UCP_UPPER_CASE_LETTER,
  56. UCP_SPACING_MARK,
  57. UCP_ENCLOSING_MARK,
  58. UCP_NON_SPACING_MARK,
  59. UCP_DECIMAL_NUMBER,
  60. UCP_LETTER_NUMBER,
  61. UCP_OTHER_NUMBER,
  62. UCP_CONNECTOR_PUNCTUATION,
  63. UCP_DASH_PUNCTUATION,
  64. UCP_CLOSE_PUNCTUATION,
  65. UCP_FINAL_PUNCTUATION,
  66. UCP_INITIAL_PUNCTUATION,
  67. UCP_OTHER_PUNCTUATION,
  68. UCP_OPEN_PUNCTUATION,
  69. UCP_CURRENCY_SYMBOL,
  70. UCP_MODIFIER_SYMBOL,
  71. UCP_MATHEMATICAL_SYMBOL,
  72. UCP_OTHER_SYMBOL,
  73. UCP_LINE_SEPARATOR,
  74. UCP_PARAGRAPH_SEPARATOR,
  75. UCP_SPACE_SEPARATOR
  76. };
  77. enum Script
  78. /// Unicode 7.0 script identifiers.
  79. {
  80. UCP_ARABIC,
  81. UCP_ARMENIAN,
  82. UCP_BENGALI,
  83. UCP_BOPOMOFO,
  84. UCP_BRAILLE,
  85. UCP_BUGINESE,
  86. UCP_BUHID,
  87. UCP_CANADIAN_ABORIGINAL,
  88. UCP_CHEROKEE,
  89. UCP_COMMON,
  90. UCP_COPTIC,
  91. UCP_CYPRIOT,
  92. UCP_CYRILLIC,
  93. UCP_DESERET,
  94. UCP_DEVANAGARI,
  95. UCP_ETHIOPIC,
  96. UCP_GEORGIAN,
  97. UCP_GLAGOLITIC,
  98. UCP_GOTHIC,
  99. UCP_GREEK,
  100. UCP_GUJARATI,
  101. UCP_GURMUKHI,
  102. UCP_HAN,
  103. UCP_HANGUL,
  104. UCP_HANUNOO,
  105. UCP_HEBREW,
  106. UCP_HIRAGANA,
  107. UCP_INHERITED,
  108. UCP_KANNADA,
  109. UCP_KATAKANA,
  110. UCP_KHAROSHTHI,
  111. UCP_KHMER,
  112. UCP_LAO,
  113. UCP_LATIN,
  114. UCP_LIMBU,
  115. UCP_LINEAR_B,
  116. UCP_MALAYALAM,
  117. UCP_MONGOLIAN,
  118. UCP_MYANMAR,
  119. UCP_NEW_TAI_LUE,
  120. UCP_OGHAM,
  121. UCP_OLD_ITALIC,
  122. UCP_OLD_PERSIAN,
  123. UCP_ORIYA,
  124. UCP_OSMANYA,
  125. UCP_RUNIC,
  126. UCP_SHAVIAN,
  127. UCP_SINHALA,
  128. UCP_SYLOTI_NAGRI,
  129. UCP_SYRIAC,
  130. UCP_TAGALOG,
  131. UCP_TAGBANWA,
  132. UCP_TAI_LE,
  133. UCP_TAMIL,
  134. UCP_TELUGU,
  135. UCP_THAANA,
  136. UCP_THAI,
  137. UCP_TIBETAN,
  138. UCP_TIFINAGH,
  139. UCP_UGARITIC,
  140. UCP_YI,
  141. // Unicode 5.0
  142. UCP_BALINESE,
  143. UCP_CUNEIFORM,
  144. UCP_NKO,
  145. UCP_PHAGS_PA,
  146. UCP_PHOENICIAN,
  147. // Unicode 5.1
  148. UCP_CARIAN,
  149. UCP_CHAM,
  150. UCP_KAYAH_LI,
  151. UCP_LEPCHA,
  152. UCP_LYCIAN,
  153. UCP_LYDIAN,
  154. UCP_OL_CHIKI,
  155. UCP_REJANG,
  156. UCP_SAURASHTRA,
  157. UCP_SUNDANESE,
  158. UCP_VAI,
  159. // Unicode 5.2
  160. UCP_AVESTAN,
  161. UCP_BAMUM,
  162. UCP_EGYPTIAN_HIEROGLYPHS,
  163. UCP_IMPERIAL_ARAMAIC,
  164. UCP_INSCRIPTIONAL_PAHLAVI,
  165. UCP_INSCRIPTIONAL_PARTHIAN,
  166. UCP_JAVANESE,
  167. UCP_KAITHI,
  168. UCP_LISU,
  169. UCP_MEETEI_MAYEK,
  170. UCP_OLD_SOUTH_ARABIAN,
  171. UCP_OLD_TURKIC,
  172. UCP_SAMARITAN,
  173. UCP_TAI_THAM,
  174. UCP_TAI_VIET,
  175. // Unicode 6.0
  176. UCP_BATAK,
  177. UCP_BRAHMI,
  178. UCP_MANDAIC,
  179. // Unicode 6.1
  180. UCP_CHAKMA,
  181. UCP_MEROITIC_CURSIVE,
  182. UCP_MEROITIC_HIEROGLYPHS,
  183. UCP_MIAO,
  184. UCP_SHARADA,
  185. UCP_SORA_SOMPENG,
  186. UCP_TAKRI,
  187. // Unicode 7.0
  188. UCP_BASSA_VAH,
  189. UCP_CAUCASIAN_ALBANIAN,
  190. UCP_DUPLOYAN,
  191. UCP_ELBASAN,
  192. UCP_GRANTHA,
  193. UCP_KHOJKI,
  194. UCP_KHUDAWADI,
  195. UCP_LINEAR_A,
  196. UCP_MAHAJANI,
  197. UCP_MANICHAEAN,
  198. UCP_MENDE_KIKAKUI,
  199. UCP_MODI,
  200. UCP_MRO,
  201. UCP_NABATAEAN,
  202. UCP_OLD_NORTH_ARABIAN,
  203. UCP_OLD_PERMIC,
  204. UCP_PAHAWH_HMONG,
  205. UCP_PALMYRENE,
  206. UCP_PSALTER_PAHLAVI,
  207. UCP_PAU_CIN_HAU,
  208. UCP_SIDDHAM,
  209. UCP_TIRHUTA,
  210. UCP_WARANG_CITI
  211. };
  212. enum
  213. {
  214. UCP_MAX_CODEPOINT = 0x10FFFF
  215. };
  216. struct CharacterProperties
  217. /// This structure holds the character properties
  218. /// of an Unicode character.
  219. {
  220. CharacterCategory category;
  221. CharacterType type;
  222. Script script;
  223. };
  224. static void properties(int ch, CharacterProperties& props);
  225. /// Return the Unicode character properties for the
  226. /// character with the given Unicode value.
  227. static bool isSpace(int ch);
  228. /// Returns true iff the given character is a separator.
  229. static bool isDigit(int ch);
  230. /// Returns true iff the given character is a numeric character.
  231. static bool isPunct(int ch);
  232. /// Returns true iff the given character is a punctuation character.
  233. static bool isAlpha(int ch);
  234. /// Returns true iff the given character is a letter.
  235. static bool isLower(int ch);
  236. /// Returns true iff the given character is a lowercase
  237. /// character.
  238. static bool isUpper(int ch);
  239. /// Returns true iff the given character is an uppercase
  240. /// character.
  241. static int toLower(int ch);
  242. /// If the given character is an uppercase character,
  243. /// return its lowercase counterpart, otherwise return
  244. /// the character.
  245. static int toUpper(int ch);
  246. /// If the given character is a lowercase character,
  247. /// return its uppercase counterpart, otherwise return
  248. /// the character.
  249. };
  250. //
  251. // inlines
  252. //
  253. inline bool Unicode::isSpace(int ch)
  254. {
  255. CharacterProperties props;
  256. properties(ch, props);
  257. return props.category == UCP_SEPARATOR;
  258. }
  259. inline bool Unicode::isDigit(int ch)
  260. {
  261. CharacterProperties props;
  262. properties(ch, props);
  263. return props.category == UCP_NUMBER;
  264. }
  265. inline bool Unicode::isPunct(int ch)
  266. {
  267. CharacterProperties props;
  268. properties(ch, props);
  269. return props.category == UCP_PUNCTUATION;
  270. }
  271. inline bool Unicode::isAlpha(int ch)
  272. {
  273. CharacterProperties props;
  274. properties(ch, props);
  275. return props.category == UCP_LETTER;
  276. }
  277. inline bool Unicode::isLower(int ch)
  278. {
  279. CharacterProperties props;
  280. properties(ch, props);
  281. return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;
  282. }
  283. inline bool Unicode::isUpper(int ch)
  284. {
  285. CharacterProperties props;
  286. properties(ch, props);
  287. return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;
  288. }
  289. } // namespace Poco
  290. #endif // Foundation_Unicode_INCLUDED