TextEncoding.h 6.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. //
  2. // TextEncoding.h
  3. //
  4. // Library: Foundation
  5. // Package: Text
  6. // Module: TextEncoding
  7. //
  8. // Definition of the abstract TextEncoding class.
  9. //
  10. // Copyright (c) 2004-2007, Applied Informatics Software Engineering GmbH.
  11. // and Contributors.
  12. //
  13. // SPDX-License-Identifier: BSL-1.0
  14. //
  15. #ifndef Foundation_TextEncoding_INCLUDED
  16. #define Foundation_TextEncoding_INCLUDED
  17. #include "Poco/Foundation.h"
  18. #include "Poco/SharedPtr.h"
  19. namespace Poco {
  20. class TextEncodingManager; // Forward declaration; in this header it is only used as the return type of TextEncoding::manager().
  21. class Foundation_API TextEncoding
  22. /// An abstract base class for implementing text encodings
  23. /// like UTF-8 or ISO 8859-1.
  24. ///
  25. /// Subclasses must override the canonicalName(), isA(),
  26. /// characterMap() and convert() methods and need to be
  27. /// thread safe and stateless.
  28. ///
  29. /// TextEncoding also provides static member functions
  30. /// for managing mappings from encoding names to
  31. /// TextEncoding objects.
  32. {
  33. public:
  34. typedef SharedPtr<TextEncoding> Ptr; // SharedPtr handle type taken/returned by find(), add() and global().
  35. enum
  36. {
  37. MAX_SEQUENCE_LENGTH = 4 /// The maximum character byte sequence length supported.
  38. };
  39. typedef int CharacterMap[256];
  40. /// The map[b] member gives information about byte sequences
  41. /// whose first byte is b.
  42. /// If map[b] is c where c is >= 0, then b by itself encodes the Unicode scalar value c.
  43. /// If map[b] is -1, then the byte sequence is malformed.
  44. /// If map[b] is -n, where n >= 2, then b is the first byte of an n-byte
  45. /// sequence that encodes a single Unicode scalar value. Byte sequences up
  46. /// to 4 bytes in length are supported.
  47. virtual ~TextEncoding();
  48. /// Destroys the encoding.
  49. virtual const char* canonicalName() const = 0;
  50. /// Returns the canonical name of this encoding,
  51. /// e.g. "ISO-8859-1". Encoding name comparisons are case
  52. /// insensitive.
  53. virtual bool isA(const std::string& encodingName) const = 0;
  54. /// Returns true if the given name is one of the names of this encoding.
  55. /// For example, the "ISO-8859-1" encoding is also known as "Latin-1".
  56. ///
  57. /// Encoding name comparisons are case insensitive.
  58. virtual const CharacterMap& characterMap() const = 0;
  59. /// Returns the CharacterMap for the encoding.
  60. /// The CharacterMap should be kept in a static member. As
  61. /// characterMap() can be called frequently, it should be
  62. /// implemented in such a way that it just returns a static
  63. /// map. If the map is built at runtime, this should be
  64. /// done in the constructor.
  65. virtual int convert(const unsigned char* bytes) const;
  66. /// The convert function is used to convert multibyte sequences;
  67. /// bytes will point to a byte sequence of n bytes where
  68. /// sequenceLength(bytes, length) == -n, with length >= n.
  69. ///
  70. /// The convert function must return the Unicode scalar value
  71. /// represented by this byte sequence or -1 if the byte sequence is malformed.
  72. ///
  73. /// The default implementation returns (int) bytes[0].
  74. virtual int queryConvert(const unsigned char* bytes, int length) const;
  75. /// The queryConvert function is used to convert single byte characters
  76. /// or multibyte sequences;
  77. /// bytes will point to a byte sequence of length bytes.
  78. ///
  79. /// The queryConvert function must return the Unicode scalar value
  80. /// represented by this byte sequence, or -1 if the byte sequence is malformed,
  81. /// or -n, where n is the number of bytes requested for the sequence, if length is
  82. /// shorter than the sequence.
  83. /// The length of the sequence might not be determined by the first byte,
  84. /// in which case the conversion becomes an iterative process:
  85. /// The first call with length == 1 might return -2,
  86. /// then a second call with length == 2 might return -4.
  87. /// Eventually, the third call with length == 4 should return either a
  88. /// Unicode scalar value, or -1 if the byte sequence is malformed.
  89. ///
  90. /// The default implementation returns (int) bytes[0].
  91. virtual int sequenceLength(const unsigned char* bytes, int length) const;
  92. /// The sequenceLength function is used to get the length of the sequence pointed
  93. /// to by bytes. The length parameter should be greater than or equal to the length of
  94. /// the sequence.
  95. ///
  96. /// The sequenceLength function must return the length of the sequence
  97. /// represented by this byte sequence or a negative value -n if length is
  98. /// shorter than the sequence, where n is the number of bytes requested
  99. /// to determine the length of the sequence.
  100. /// The length of the sequence might not be determined by the first byte,
  101. /// in which case the conversion becomes an iterative process as long as the
  102. /// result is negative:
  103. /// The first call with length == 1 might return -2,
  104. /// then a second call with length == 2 might return -4.
  105. /// Eventually, the third call with length == 4 should return 4.
  106. /// The default implementation returns 1.
  107. virtual int convert(int ch, unsigned char* bytes, int length) const;
  108. /// Transforms the Unicode character ch into the encoding's
  109. /// byte sequence. The method returns the number of bytes
  110. /// used. The method must not use more than length bytes.
  111. /// Both bytes and length can also be null - in this case only the number
  112. /// of bytes required to represent ch is returned.
  113. /// If the character cannot be converted, 0 is returned and
  114. /// the byte sequence remains unchanged.
  115. /// The default implementation simply returns 0.
  116. static TextEncoding& byName(const std::string& encodingName);
  117. /// Returns the TextEncoding object for the given encoding name.
  118. ///
  119. /// Throws a NotFoundException if the encoding with given name is not available.
  120. static TextEncoding::Ptr find(const std::string& encodingName);
  121. /// Returns a pointer to the TextEncoding object for the given encodingName,
  122. /// or NULL if no such TextEncoding object exists.
  123. static void add(TextEncoding::Ptr encoding);
  124. /// Adds the given TextEncoding to the table of text encodings,
  125. /// under the encoding's canonical name.
  126. ///
  127. /// If an encoding with the given name is already registered,
  128. /// it is replaced.
  129. static void add(TextEncoding::Ptr encoding, const std::string& name);
  130. /// Adds the given TextEncoding to the table of text encodings,
  131. /// under the given name.
  132. ///
  133. /// If an encoding with the given name is already registered,
  134. /// it is replaced.
  135. static void remove(const std::string& encodingName);
  136. /// Removes the encoding with the given name from the table
  137. /// of text encodings.
  138. static TextEncoding::Ptr global(TextEncoding::Ptr encoding);
  139. /// Sets the global TextEncoding object.
  140. ///
  141. /// This function sets the global encoding to the argument and returns a
  142. /// pointer (TextEncoding::Ptr) to the previous global encoding.
  143. static TextEncoding& global();
  144. /// Returns the current global TextEncoding object.
  145. static const std::string GLOBAL;
  146. /// Name of the global TextEncoding, which is the empty string.
  147. protected:
  148. static TextEncodingManager& manager();
  149. /// Returns the TextEncodingManager.
  150. };
  151. } // namespace Poco
  152. #endif // Foundation_TextEncoding_INCLUDED