strdup8to16.cpp 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
  1. /* libs/cutils/strdup8to16.c
  2. **
  3. ** Copyright 2006, The Android Open Source Project
  4. **
  5. ** Licensed under the Apache License, Version 2.0 (the "License");
  6. ** you may not use this file except in compliance with the License.
  7. ** You may obtain a copy of the License at
  8. **
  9. ** http://www.apache.org/licenses/LICENSE-2.0
  10. **
  11. ** Unless required by applicable law or agreed to in writing, software
  12. ** distributed under the License is distributed on an "AS IS" BASIS,
  13. ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. ** See the License for the specific language governing permissions and
  15. ** limitations under the License.
  16. */
  17. #include <cutils/jstring.h>
  18. #include <assert.h>
  19. #include <limits.h>
  20. #include <stdlib.h>
  /* See http://www.unicode.org/reports/tr22/ for discussion
   * on invalid sequences.
   */
  #define UTF16_REPLACEMENT_CHAR 0xfffd

  /*
   * Clever trick from Dianne that returns 1-4 depending on the leading bit
   * sequence of a UTF-8 byte.
   *
   * 0xe5000000 is a table of 2-bit entries indexed by the top four bits of
   * the byte: 0x0-0xb -> 0, 0xc-0xd -> 1, 0xe -> 2, 0xf -> 3; adding one
   * gives the sequence length.  Continuation bytes (10xxxxxx) therefore
   * report a length of 1.
   *
   * The byte is masked before it is shifted so that a negative value (a
   * plain "char" that is signed) never gets right-shifted.
   */
  #define UTF8_SEQ_LENGTH(ch) (((0xe5000000 >> (((ch) & 0xf0) >> 3)) & 3) + 1)

  /*
   * Append the low six bits of a continuation byte to "unicode".
   *
   * Wrapped in do/while(0) so the two statements behave as a single
   * statement, e.g. under an unbraced if/else.  Invoke with a trailing ';'.
   */
  #define UTF8_SHIFT_AND_MASK(unicode, byte) \
      do { (unicode) <<= 6; (unicode) |= (0x3f & (byte)); } while (0)

  /* Largest code point the converters in this file accept; note that this
   * is slightly below the Unicode maximum of 0x10ffff.
   */
  #define UNICODE_UPPER_LIMIT 0x10fffd
  /**
   * Allocate a new UTF-16 buffer holding the conversion of the
   * NUL-terminated (modified) UTF-8 string "s".
   *
   * out_len is an out parameter (which may not be null) receiving the
   * length, in UTF-16 code units, of the result (which may contain
   * embedded \0's).  The result is NOT null terminated.
   *
   * Returns NULL if "s" is NULL (out_len is left untouched), if the byte
   * size would overflow size_t (out_len is left untouched), or if the
   * allocation fails (out_len is set to 0).  For an empty input the
   * return value is whatever malloc(0) yields, which may itself be NULL.
   *
   * The buffer comes from malloc().
   */
  extern char16_t * strdup8to16 (const char* s, size_t *out_len)
  {
      char16_t *ret;
      size_t len;

      if (s == NULL) return NULL;

      len = strlen8to16(s);

      // fail on overflow of the byte count handed to malloc
      if (len > SIZE_MAX / sizeof(char16_t))
          return NULL;

      // no plus-one here. UTF-16 strings are not null terminated
      ret = (char16_t *) malloc (sizeof(char16_t) * len);
      if (ret == NULL) {
          // Either out of memory or malloc(0) returned NULL; in both cases
          // there is nothing to copy into, so never hand NULL to the copier.
          *out_len = 0;
          return NULL;
      }

      return strcpy8to16 (ret, s, out_len);
  }
  /**
   * Like "strlen", but for strings encoded with Java's modified UTF-8.
   *
   * Walks the NUL-terminated input and returns how many UTF-16 code units
   * are needed to represent it: one per lead byte, two for a lead byte
   * announcing a 4-byte sequence (surrogate pair), and one for every
   * continuation byte beyond what the preceding lead byte announced
   * (strcpy8to16 emits a UTF16_REPLACEMENT_CHAR for each of those).
   */
  extern size_t strlen8to16 (const char* utf8Str)
  {
      size_t units = 0;
      /* continuation bytes still owed to the most recent lead byte */
      int pending = 0;
      const char *cursor;

      for (cursor = utf8Str; *cursor != '\0'; cursor++) {
          const int byte = *cursor;
          const int is_continuation = ((byte & 0xc0) == 0x80);

          if (!is_continuation) {
              /* bytes that start 0? or 11 are lead bytes */
              pending = UTF8_SEQ_LENGTH(byte) - 1;
              /* a 4-byte sequence is counted as a surrogate pair */
              units += (pending == 3) ? 2 : 1;
              continue;
          }

          /* bytes that start 10 are extension bytes; only the surplus
           * ones are counted
           */
          if (--pending < 0) {
              units++;
          }
      }

      return units;
  }
  82. /*
  83. * Retrieve the next UTF-32 character from a UTF-8 string.
  84. *
  85. * Stops at inner \0's
  86. *
  87. * Returns UTF16_REPLACEMENT_CHAR if an invalid sequence is encountered
  88. *
  89. * Advances "*pUtf8Ptr" to the start of the next character.
  90. */
  91. static inline uint32_t getUtf32FromUtf8(const char** pUtf8Ptr)
  92. {
  93. uint32_t ret;
  94. int seq_len;
  95. int i;
  96. /* Mask for leader byte for lengths 1, 2, 3, and 4 respectively*/
  97. static const unsigned char leaderMask[4] = {0xff, 0x1f, 0x0f, 0x07};
  98. /* Bytes that start with bits "10" are not leading characters. */
  99. if (((**pUtf8Ptr) & 0xc0) == 0x80) {
  100. (*pUtf8Ptr)++;
  101. return UTF16_REPLACEMENT_CHAR;
  102. }
  103. /* note we tolerate invalid leader 11111xxx here */
  104. seq_len = UTF8_SEQ_LENGTH(**pUtf8Ptr);
  105. ret = (**pUtf8Ptr) & leaderMask [seq_len - 1];
  106. if (**pUtf8Ptr == '\0') return ret;
  107. (*pUtf8Ptr)++;
  108. for (i = 1; i < seq_len ; i++, (*pUtf8Ptr)++) {
  109. if ((**pUtf8Ptr) == '\0') return UTF16_REPLACEMENT_CHAR;
  110. if (((**pUtf8Ptr) & 0xc0) != 0x80) return UTF16_REPLACEMENT_CHAR;
  111. UTF8_SHIFT_AND_MASK(ret, **pUtf8Ptr);
  112. }
  113. return ret;
  114. }
  115. /**
  116. * out_len is an out parameter (which may not be null) containing the
  117. * length of the UTF-16 string (which may contain embedded \0's)
  118. */
  119. extern char16_t * strcpy8to16 (char16_t *utf16Str, const char*utf8Str,
  120. size_t *out_len)
  121. {
  122. char16_t *dest = utf16Str;
  123. while (*utf8Str != '\0') {
  124. uint32_t ret;
  125. ret = getUtf32FromUtf8(&utf8Str);
  126. if (ret <= 0xffff) {
  127. *dest++ = (char16_t) ret;
  128. } else if (ret <= UNICODE_UPPER_LIMIT) {
  129. /* Create surrogate pairs */
  130. /* See http://en.wikipedia.org/wiki/UTF-16/UCS-2#Method_for_code_points_in_Plane_1.2C_Plane_2 */
  131. *dest++ = 0xd800 | ((ret - 0x10000) >> 10);
  132. *dest++ = 0xdc00 | ((ret - 0x10000) & 0x3ff);
  133. } else {
  134. *dest++ = UTF16_REPLACEMENT_CHAR;
  135. }
  136. }
  137. *out_len = dest - utf16Str;
  138. return utf16Str;
  139. }
  140. /**
  141. * length is the number of characters in the UTF-8 string.
  142. * out_len is an out parameter (which may not be null) containing the
  143. * length of the UTF-16 string (which may contain embedded \0's)
  144. */
  145. extern char16_t * strcpylen8to16 (char16_t *utf16Str, const char*utf8Str,
  146. int length, size_t *out_len)
  147. {
  148. /* TODO: Share more of this code with the method above. Only 2 lines changed. */
  149. char16_t *dest = utf16Str;
  150. const char *end = utf8Str + length; /* This line */
  151. while (utf8Str < end) { /* and this line changed. */
  152. uint32_t ret;
  153. ret = getUtf32FromUtf8(&utf8Str);
  154. if (ret <= 0xffff) {
  155. *dest++ = (char16_t) ret;
  156. } else if (ret <= UNICODE_UPPER_LIMIT) {
  157. /* Create surrogate pairs */
  158. /* See http://en.wikipedia.org/wiki/UTF-16/UCS-2#Method_for_code_points_in_Plane_1.2C_Plane_2 */
  159. *dest++ = 0xd800 | ((ret - 0x10000) >> 10);
  160. *dest++ = 0xdc00 | ((ret - 0x10000) & 0x3ff);
  161. } else {
  162. *dest++ = UTF16_REPLACEMENT_CHAR;
  163. }
  164. }
  165. *out_len = dest - utf16Str;
  166. return utf16Str;
  167. }