123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327 |
- //
- // Unicode.h
- //
- // Library: Foundation
- // Package: Text
- // Module: Unicode
- //
- // Definition of the Unicode class.
- //
- // Copyright (c) 2007, Applied Informatics Software Engineering GmbH.
- // and Contributors.
- //
- // SPDX-License-Identifier: BSL-1.0
- //
- #ifndef Foundation_Unicode_INCLUDED
- #define Foundation_Unicode_INCLUDED
- #include "Poco/Foundation.h"
- namespace Poco {
- class Foundation_API Unicode
- /// This class contains enumerations and static
- /// utility functions for dealing with Unicode characters
- /// and their properties.
- ///
- /// For more information on Unicode, see <http://www.unicode.org>.
- ///
- /// The implementation is based on the Unicode support
- /// functions in PCRE.
- {
- public:
- // Implementation note: the following definitions must be kept
- // in sync with those from ucp.h (PCRE).
- enum CharacterCategory
- /// Unicode character categories.
- {
- UCP_OTHER,
- UCP_LETTER,
- UCP_MARK,
- UCP_NUMBER,
- UCP_PUNCTUATION,
- UCP_SYMBOL,
- UCP_SEPARATOR
- };
- enum CharacterType
- /// Unicode character types.
- {
- UCP_CONTROL,
- UCP_FORMAT,
- UCP_UNASSIGNED,
- UCP_PRIVATE_USE,
- UCP_SURROGATE,
- UCP_LOWER_CASE_LETTER,
- UCP_MODIFIER_LETTER,
- UCP_OTHER_LETTER,
- UCP_TITLE_CASE_LETTER,
- UCP_UPPER_CASE_LETTER,
- UCP_SPACING_MARK,
- UCP_ENCLOSING_MARK,
- UCP_NON_SPACING_MARK,
- UCP_DECIMAL_NUMBER,
- UCP_LETTER_NUMBER,
- UCP_OTHER_NUMBER,
- UCP_CONNECTOR_PUNCTUATION,
- UCP_DASH_PUNCTUATION,
- UCP_CLOSE_PUNCTUATION,
- UCP_FINAL_PUNCTUATION,
- UCP_INITIAL_PUNCTUATION,
- UCP_OTHER_PUNCTUATION,
- UCP_OPEN_PUNCTUATION,
- UCP_CURRENCY_SYMBOL,
- UCP_MODIFIER_SYMBOL,
- UCP_MATHEMATICAL_SYMBOL,
- UCP_OTHER_SYMBOL,
- UCP_LINE_SEPARATOR,
- UCP_PARAGRAPH_SEPARATOR,
- UCP_SPACE_SEPARATOR
- };
-
- enum Script
- /// Unicode 7.0 script identifiers.
- {
- UCP_ARABIC,
- UCP_ARMENIAN,
- UCP_BENGALI,
- UCP_BOPOMOFO,
- UCP_BRAILLE,
- UCP_BUGINESE,
- UCP_BUHID,
- UCP_CANADIAN_ABORIGINAL,
- UCP_CHEROKEE,
- UCP_COMMON,
- UCP_COPTIC,
- UCP_CYPRIOT,
- UCP_CYRILLIC,
- UCP_DESERET,
- UCP_DEVANAGARI,
- UCP_ETHIOPIC,
- UCP_GEORGIAN,
- UCP_GLAGOLITIC,
- UCP_GOTHIC,
- UCP_GREEK,
- UCP_GUJARATI,
- UCP_GURMUKHI,
- UCP_HAN,
- UCP_HANGUL,
- UCP_HANUNOO,
- UCP_HEBREW,
- UCP_HIRAGANA,
- UCP_INHERITED,
- UCP_KANNADA,
- UCP_KATAKANA,
- UCP_KHAROSHTHI,
- UCP_KHMER,
- UCP_LAO,
- UCP_LATIN,
- UCP_LIMBU,
- UCP_LINEAR_B,
- UCP_MALAYALAM,
- UCP_MONGOLIAN,
- UCP_MYANMAR,
- UCP_NEW_TAI_LUE,
- UCP_OGHAM,
- UCP_OLD_ITALIC,
- UCP_OLD_PERSIAN,
- UCP_ORIYA,
- UCP_OSMANYA,
- UCP_RUNIC,
- UCP_SHAVIAN,
- UCP_SINHALA,
- UCP_SYLOTI_NAGRI,
- UCP_SYRIAC,
- UCP_TAGALOG,
- UCP_TAGBANWA,
- UCP_TAI_LE,
- UCP_TAMIL,
- UCP_TELUGU,
- UCP_THAANA,
- UCP_THAI,
- UCP_TIBETAN,
- UCP_TIFINAGH,
- UCP_UGARITIC,
- UCP_YI,
- // Unicode 5.0
- UCP_BALINESE,
- UCP_CUNEIFORM,
- UCP_NKO,
- UCP_PHAGS_PA,
- UCP_PHOENICIAN,
- // Unicode 5.1
- UCP_CARIAN,
- UCP_CHAM,
- UCP_KAYAH_LI,
- UCP_LEPCHA,
- UCP_LYCIAN,
- UCP_LYDIAN,
- UCP_OL_CHIKI,
- UCP_REJANG,
- UCP_SAURASHTRA,
- UCP_SUNDANESE,
- UCP_VAI,
- // Unicode 5.2
- UCP_AVESTAN,
- UCP_BAMUM,
- UCP_EGYPTIAN_HIEROGLYPHS,
- UCP_IMPERIAL_ARAMAIC,
- UCP_INSCRIPTIONAL_PAHLAVI,
- UCP_INSCRIPTIONAL_PARTHIAN,
- UCP_JAVANESE,
- UCP_KAITHI,
- UCP_LISU,
- UCP_MEETEI_MAYEK,
- UCP_OLD_SOUTH_ARABIAN,
- UCP_OLD_TURKIC,
- UCP_SAMARITAN,
- UCP_TAI_THAM,
- UCP_TAI_VIET,
- // Unicode 6.0
- UCP_BATAK,
- UCP_BRAHMI,
- UCP_MANDAIC,
- // Unicode 6.1
- UCP_CHAKMA,
- UCP_MEROITIC_CURSIVE,
- UCP_MEROITIC_HIEROGLYPHS,
- UCP_MIAO,
- UCP_SHARADA,
- UCP_SORA_SOMPENG,
- UCP_TAKRI,
- // Unicode 7.0
- UCP_BASSA_VAH,
- UCP_CAUCASIAN_ALBANIAN,
- UCP_DUPLOYAN,
- UCP_ELBASAN,
- UCP_GRANTHA,
- UCP_KHOJKI,
- UCP_KHUDAWADI,
- UCP_LINEAR_A,
- UCP_MAHAJANI,
- UCP_MANICHAEAN,
- UCP_MENDE_KIKAKUI,
- UCP_MODI,
- UCP_MRO,
- UCP_NABATAEAN,
- UCP_OLD_NORTH_ARABIAN,
- UCP_OLD_PERMIC,
- UCP_PAHAWH_HMONG,
- UCP_PALMYRENE,
- UCP_PSALTER_PAHLAVI,
- UCP_PAU_CIN_HAU,
- UCP_SIDDHAM,
- UCP_TIRHUTA,
- UCP_WARANG_CITI
- };
-
- enum
- {
- UCP_MAX_CODEPOINT = 0x10FFFF
- };
-
- struct CharacterProperties
- /// This structure holds the character properties
- /// of an Unicode character.
- {
- CharacterCategory category;
- CharacterType type;
- Script script;
- };
- static void properties(int ch, CharacterProperties& props);
- /// Return the Unicode character properties for the
- /// character with the given Unicode value.
-
- static bool isSpace(int ch);
- /// Returns true iff the given character is a separator.
-
- static bool isDigit(int ch);
- /// Returns true iff the given character is a numeric character.
-
- static bool isPunct(int ch);
- /// Returns true iff the given character is a punctuation character.
-
- static bool isAlpha(int ch);
- /// Returns true iff the given character is a letter.
-
- static bool isLower(int ch);
- /// Returns true iff the given character is a lowercase
- /// character.
-
- static bool isUpper(int ch);
- /// Returns true iff the given character is an uppercase
- /// character.
-
- static int toLower(int ch);
- /// If the given character is an uppercase character,
- /// return its lowercase counterpart, otherwise return
- /// the character.
- static int toUpper(int ch);
- /// If the given character is a lowercase character,
- /// return its uppercase counterpart, otherwise return
- /// the character.
- };
- //
- // inlines
- //
- inline bool Unicode::isSpace(int ch)
- {
- CharacterProperties props;
- properties(ch, props);
- return props.category == UCP_SEPARATOR;
- }
- inline bool Unicode::isDigit(int ch)
- {
- CharacterProperties props;
- properties(ch, props);
- return props.category == UCP_NUMBER;
- }
- inline bool Unicode::isPunct(int ch)
- {
- CharacterProperties props;
- properties(ch, props);
- return props.category == UCP_PUNCTUATION;
- }
- inline bool Unicode::isAlpha(int ch)
- {
- CharacterProperties props;
- properties(ch, props);
- return props.category == UCP_LETTER;
- }
- inline bool Unicode::isLower(int ch)
- {
- CharacterProperties props;
- properties(ch, props);
- return props.category == UCP_LETTER && props.type == UCP_LOWER_CASE_LETTER;
- }
-
- inline bool Unicode::isUpper(int ch)
- {
- CharacterProperties props;
- properties(ch, props);
- return props.category == UCP_LETTER && props.type == UCP_UPPER_CASE_LETTER;
- }
- } // namespace Poco
- #endif // Foundation_Unicode_INCLUDED
|