drivers/staging/csr/csr_utf16.c

   1 /*****************************************************************************
   2
   3             (c) Cambridge Silicon Radio Limited 2010
   4             All rights reserved and confidential information of CSR
   5
   6             Refer to LICENSE.txt included with this source for details
   7             on the license terms.
   8
   9 *****************************************************************************/
  10 #include <linux/module.h>
  11 #include "csr_types.h"
  12 #include "csr_pmem.h"
  13 #include "csr_unicode.h"
  14 #include "csr_util.h"
  15
  16 #define UNI_SUR_HIGH_START   ((CsrUint32) 0xD800)
  17 #define UNI_SUR_HIGH_END     ((CsrUint32) 0xDBFF)
  18 #define UNI_SUR_LOW_START    ((CsrUint32) 0xDC00)
  19 #define UNI_SUR_LOW_END      ((CsrUint32) 0xDFFF)
  20 #define UNI_REPLACEMENT_CHAR ((CsrUint32) 0xFFFD)
  21 #define UNI_HALF_SHIFT       ((CsrUint8) 10)  /* used for shifting by 10 bits */
  22 #define UNI_HALF_BASE        ((CsrUint32) 0x00010000)
  23 #define UNI_BYTEMASK         ((CsrUint32) 0xBF)
  24 #define UNI_BYTEMARK         ((CsrUint32) 0x80)
  25
  26 #define CAPITAL(x)    ((x >= 'a') && (x <= 'z') ? ((x) & 0x00DF) : (x))
  27
  28 /*
  29 *  Index into the table with the first byte to get the number of trailing bytes in a utf-8 character.
  30 *  -1 if the byte has an invalid value.
  31 *
  32 *  Legal sequences are:
  33 *
  34 *  byte  1st      2nd      3rd      4th
  35 *
  36 *       00-7F
  37 *       C2-DF    80-BF
  38 *       E0       A0-BF    80-BF
  39 *       E1-EC    80-BF    80-BF
  40 *       ED       80-9F    80-BF
  41 *       EE-EF    80-BF    80-BF
  42 *       F0       90-BF    80-BF    80-BF
  43 *       F1-F3    80-BF    80-BF    80-BF
  44 *       F4       80-8F    80-BF    80-BF
  45 */
  46 static const CsrInt8 trailingBytesForUtf8[256] =
  47 {
  48     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,                                 /* 0x00 - 0x1F */
  49     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,                                 /* 0x20 - 0x3F */
  50     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,                                 /* 0x40 - 0x5F */
  51     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,                                 /* 0x60 - 0x7F */
  52     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0x80 - 0x9F */
  53     -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, /* 0xA0 - 0xBF */
  54     -1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,                               /* 0xC0 - 0xDF */
  55     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,                      /* 0xE0 - 0xFF */
  56 };
  57
  58 /* Values to be substracted from a CsrUint32 when converting from UTF8 to UTF16 */
  59 static const CsrUint32 offsetsFromUtf8[4] =
  60 {
  61     0x00000000, 0x00003080, 0x000E2080, 0x03C82080
  62 };
  63
  64 /********************************************************************************
  65 *
  66 *   Name:           CsrUint32ToUtf16String
  67 *
  68 *   Description:    The function converts an 32 bit number to an UTF-16 string
  69 *                   that is allocated and 0-terminated.
  70 *
  71 *   Input:          32 bit number.
  72 *
  73 *   Output:         A string of UTF-16 characters.
  74 *
  75 *********************************************************************************/
  76 CsrUtf16String *CsrUint32ToUtf16String(CsrUint32 number)
  77 {
  78     CsrUint16 count, noOfDigits;
  79     CsrUtf16String *output;
  80     CsrUint32 tempNumber;
  81
  82     /* calculate the number of digits in the output */
  83     tempNumber = number;
  84     noOfDigits = 1;
  85     while (tempNumber >= 10)
  86     {
  87         tempNumber = tempNumber / 10;
  88         noOfDigits++;
  89     }
  90
  91     output = (CsrUtf16String *) CsrPmemAlloc(sizeof(CsrUtf16String) * (noOfDigits + 1)); /*add space for 0-termination*/
  92
  93     tempNumber = number;
  94     for (count = noOfDigits; count > 0; count--)
  95     {
  96         output[count - 1] = (CsrUtf16String) ((tempNumber % 10) + '0');
  97         tempNumber = tempNumber / 10;
  98     }
  99     output[noOfDigits] = '\0';
 100
 101     return output;
 102 }
 103
 104 /********************************************************************************
 105 *
 106 *   Name:           CsrUtf16StringToUint32
 107 *
 108 *   Description:    The function converts an UTF-16 string that is
 109 *                   0-terminated into a 32 bit number.
 110 *
 111 *   Input:          A string of UTF-16 characters containig a number.
 112 *
 113 *   Output:         32 bit number.
 114 *
 115 *********************************************************************************/
 116 CsrUint32 CsrUtf16StringToUint32(const CsrUtf16String *unicodeString)
 117 {
 118     CsrUint16 numLen, count;
 119     CsrUint32 newNumber = 0;
 120
 121     numLen = (CsrUint16) CsrUtf16StrLen(unicodeString);
 122
 123     if ((numLen > 10) || (numLen == 0) || (unicodeString == NULL)) /*CSRMAX number is 4.294.967.295 */
 124     {
 125         return 0;
 126     }
 127
 128     for (count = 0; count < numLen; count++)
 129     {
 130         CsrUtf16String input = unicodeString[count];
 131         if ((input < 0x30) || (input > 0x39) || ((newNumber == 0x19999999) && (input > 0x35)) || (newNumber > 0x19999999)) /* chars are present or number is too large now causing number to get to large when *10 */
 132         {
 133             return 0;
 134         }
 135
 136         newNumber = (newNumber * 10) + (input - 0x30);
 137     }
 138     return newNumber;
 139 }
 140
 141 /********************************************************************************
 142 *
 143 *   Name:           CsrUtf16MemCpy
 144 *
 145 *   Description:    The function copies count number of 16 bit data elements
 146 *                   from srv to dest.
 147 *
 148 *   Input:          A pointer to an unicoded string.
 149 *
 150 *   Output:         A pointer to an unicoded string.
 151 *
 152 *********************************************************************************/
 153 CsrUtf16String *CsrUtf16MemCpy(CsrUtf16String *dest, const CsrUtf16String *src, CsrUint32 count)
 154 {
 155     return CsrMemCpy((CsrUint8 *) dest, (CsrUint8 *) src, count * sizeof(CsrUtf16String));
 156 }
 157
 158 /********************************************************************************
 159 *
 160 *   Name:           CsrUtf16ConcatenateTexts
 161 *
 162 *   Description:    The function merge the contents of 4 unicoded input pointers
 163 *                   into a new string.
 164 *
 165 *   Input:          4 unicoded input strings (UTF-16).
 166 *
 167 *   Output:         A new unicoded string (UTF-16) containing the combined strings.
 168 *
 169 *********************************************************************************/
 170 CsrUtf16String *CsrUtf16ConcatenateTexts(const CsrUtf16String *inputText1, const CsrUtf16String *inputText2,
 171     const CsrUtf16String *inputText3, const CsrUtf16String *inputText4)
 172 {
 173     CsrUtf16String *outputText;
 174     CsrUint32 textLen, textLen1, textLen2, textLen3, textLen4;
 175
 176     textLen1 = CsrUtf16StrLen(inputText1);
 177     textLen2 = CsrUtf16StrLen(inputText2);
 178     textLen3 = CsrUtf16StrLen(inputText3);
 179     textLen4 = CsrUtf16StrLen(inputText4);
 180
 181     textLen = textLen1 + textLen2 + textLen3 + textLen4;
 182
 183     if (textLen == 0) /*stop here is all lengths are 0*/
 184     {
 185         return NULL;
 186     }
 187
 188     outputText = (CsrUtf16String *) CsrPmemAlloc((textLen + 1) * sizeof(CsrUtf16String)); /* add space for 0-termination*/
 189
 190
 191     if (inputText1 != NULL)
 192     {
 193         CsrUtf16MemCpy(outputText, inputText1, textLen1);
 194     }
 195
 196     if (inputText2 != NULL)
 197     {
 198         CsrUtf16MemCpy(&(outputText[textLen1]), inputText2, textLen2);
 199     }
 200
 201     if (inputText3 != NULL)
 202     {
 203         CsrUtf16MemCpy(&(outputText[textLen1 + textLen2]), inputText3, textLen3);
 204     }
 205
 206     if (inputText4 != NULL)
 207     {
 208         CsrUtf16MemCpy(&(outputText[textLen1 + textLen2 + textLen3]), inputText4, textLen4);
 209     }
 210
 211     outputText[textLen] = '\0';
 212
 213     return outputText;
 214 }
 215
 216 /********************************************************************************
 217 *
 218 *   Name:           CsrUtf16StrLen
 219 *
 220 *   Description:    The function returns the number of 16 bit elements present
 221 *                   in the 0-terminated string.
 222 *
 223 *   Input:          0-terminated string of 16 bit unicoded characters.
 224 *
 225 *   Output:         The number of 16 bit elements in the string.
 226 *
 227 *********************************************************************************/
 228 CsrUint32 CsrUtf16StrLen(const CsrUtf16String *unicodeString)
 229 {
 230     CsrUint32 length;
 231
 232     length = 0;
 233     if (unicodeString != NULL)
 234     {
 235         while (*unicodeString)
 236         {
 237             length++;
 238             unicodeString++;
 239         }
 240     }
 241     return length;
 242 }
 243
 244 /********************************************************************************
 245 *
 246 *   Name:           CsrUtf16String2Utf8
 247 *
 248 *   Description:    The function decodes an UTF-16 string into an UTF8 byte
 249 *                   oriented string.
 250 *
 251 *   Input:          0-terminated UTF-16 string characters.
 252 *
 253 *   Output:         0-terminated string of byte oriented UTF8 coded characters.
 254 *
 255 *********************************************************************************/
 256 CsrUtf8String *CsrUtf16String2Utf8(const CsrUtf16String *source)
 257 {
 258     CsrUtf8String *dest, *destStart = NULL;
 259     CsrUint32 i;
 260     CsrUint32 ch;
 261     CsrUint32 length;
 262     CsrUint32 sourceLength;
 263     CsrUint8 bytes;
 264     CsrBool appendNull = FALSE;
 265
 266     CsrUint8 firstByteMark[5] = {0x00, 0x00, 0xC0, 0xE0, 0xF0};
 267
 268     if (!source)
 269     {
 270         return NULL;
 271     }
 272
 273     length = 0;
 274     sourceLength = CsrUtf16StrLen(source) + 1;
 275
 276     for (i = 0; i < sourceLength; i++)
 277     {
 278         ch = source[i];
 279         if ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_HIGH_END)) /* This is a high surrogate */
 280         {
 281             if (i + 1 < sourceLength) /* The low surrogate is in the source */
 282             {
 283                 CsrUint32 ch2 = source[++i];
 284                 if ((ch2 >= UNI_SUR_LOW_START) && (ch2 <= UNI_SUR_LOW_END)) /* And it is a legal low surrogate */
 285                 {
 286                     length += 4;
 287                 }
 288                 else /* It is not a low surrogate, instead put a Unicode
 289                      'REPLACEMENT CHARACTER' (U+FFFD) */
 290                 {
 291                     length += 3;
 292                     i--; /* Substract 1 again as the conversion must continue after the ill-formed code unit */
 293                 }
 294             }
 295             else /* The low surrogate does not exist, instead put a Unicode
 296                  'REPLACEMENT CHARACTER' (U+FFFD), and the null terminated character */
 297             {
 298                 length += 4;
 299             }
 300         }
 301         else if ((ch >= UNI_SUR_LOW_START) && (ch <= UNI_SUR_LOW_END)) /* The value of UTF-16 is not allowed to be in this range, instead put
 302              a Unicode 'REPLACEMENT CHARACTER' (U+FFFD) */
 303         {
 304             length += 3;
 305         }
 306         else /* Figure out how many bytes that are required */
 307         {
 308             if (ch < 0x0080)
 309             {
 310                 length++;
 311             }
 312             else if (ch < 0x0800)
 313             {
 314                 length += 2;
 315             }
 316             else
 317             {
 318                 length += 3;
 319             }
 320         }
 321     }
 322
 323     dest = CsrPmemAlloc(length);
 324     destStart = dest;
 325
 326     for (i = 0; i < sourceLength; i++)
 327     {
 328         ch = source[i];
 329         if ((ch >= UNI_SUR_HIGH_START) && (ch <= UNI_SUR_HIGH_END)) /* This is a high surrogate */
 330         {
 331             if (i + 1 < sourceLength) /* The low surrogate is in the source */
 332             {
 333                 CsrUint32 ch2 = source[++i];
 334                 if ((ch2 >= UNI_SUR_LOW_START) && (ch2 <= UNI_SUR_LOW_END)) /* And it is a legal low surrogate, convert to UTF-32 */
 335                 {
 336                     ch = ((ch - UNI_SUR_HIGH_START) << UNI_HALF_SHIFT) + (ch2 - UNI_SUR_LOW_START) + UNI_HALF_BASE;
 337                 }
 338                 else /* It is not a low surrogate, instead put a Unicode
 339                      'REPLACEMENT CHARACTER' (U+FFFD) */
 340                 {
 341                     ch = UNI_REPLACEMENT_CHAR;
 342                     i--; /* Substract 1 again as the conversion must continue after the ill-formed code unit */
 343                 }
 344             }
 345             else /* The low surrogate does not exist, instead put a Unicode
 346                  'REPLACEMENT CHARACTER' (U+FFFD), and the null terminated character */
 347             {
 348                 ch = UNI_REPLACEMENT_CHAR;
 349                 appendNull = TRUE;
 350             }
 351         }
 352         else if ((ch >= UNI_SUR_LOW_START) && (ch <= UNI_SUR_LOW_END)) /* The value of UTF-16 is not allowed to be in this range, instead put
 353              a Unicode 'REPLACEMENT CHARACTER' (U+FFFD) */
 354         {
 355             ch = UNI_REPLACEMENT_CHAR;
 356         }
 357
 358         /* Figure out how many bytes that are required */
 359         if (ch < (CsrUint32) 0x80)
 360         {
 361             bytes = 1;
 362         }
 363         else if (ch < (CsrUint32) 0x800)
 364         {
 365             bytes = 2;
 366         }
 367         else if (ch < (CsrUint32) 0x10000)
 368         {
 369             bytes = 3;
 370         }
 371         else if (ch < (CsrUint32) 0x110000)
 372         {
 373             bytes = 4;
 374         }
 375         else
 376         {
 377             bytes = 3;
 378             ch = UNI_REPLACEMENT_CHAR;
 379         }
 380
 381         dest += bytes;
 382
 383         switch (bytes) /* Convert character to UTF-8. Note: everything falls through. */
 384         {
 385             case 4:
 386             {
 387                 *--dest = (CsrUint8) ((ch | UNI_BYTEMARK) & UNI_BYTEMASK);
 388                 ch >>= 6;
 389             }
 390             /* FALLTHROUGH */
 391             case 3:
 392             {
 393                 *--dest = (CsrUint8) ((ch | UNI_BYTEMARK) & UNI_BYTEMASK);
 394                 ch >>= 6;
 395             }
 396             /* FALLTHROUGH */
 397             case 2:
 398             {
 399                 *--dest = (CsrUint8) ((ch | UNI_BYTEMARK) & UNI_BYTEMASK);
 400                 ch >>= 6;
 401             }
 402             /* FALLTHROUGH */
 403             case 1:
 404             {
 405                 *--dest = (CsrUint8) (ch | firstByteMark[bytes]);
 406             }
 407             /* FALLTHROUGH */
 408             default:
 409             {
 410                 break;
 411             }
 412         }
 413
 414         dest += bytes;
 415     }
 416
 417     if (appendNull) /* Append the \0 character */
 418     {
 419         *dest = '\0';
 420     }
 421
 422     return destStart;
 423 }
 424
 425 /*****************************************************************************
 426
 427     NAME
 428         isLegalUtf8
 429
 430     DESCRIPTION
 431         Returns TRUE if the given UFT-8 code unit is legal as defined by the
 432         Unicode standard (see Chapter 3: Conformance, Section 3.9: Unicode
 433         Encoding Forms, UTF-8).
 434
 435         This function assumes that the length parameter is unconditionally
 436         correct and that the first byte is already validated by looking it up
 437         in the trailingBytesForUtf8 array, which also reveals the number of
 438         trailing bytes.
 439
 440         Legal code units are composed of one of the following byte sequences:
 441
 442         1st      2nd      3rd      4th
 443         --------------------------------
 444         00-7F
 445         C2-DF    80-BF
 446         E0       A0-BF    80-BF
 447         E1-EC    80-BF    80-BF
 448         ED       80-9F    80-BF
 449         EE-EF    80-BF    80-BF
 450         F0       90-BF    80-BF    80-BF
 451         F1-F3    80-BF    80-BF    80-BF
 452         F4       80-8F    80-BF    80-BF
 453
 454         Please note that this function only checks whether the 2nd, 3rd and
 455         4th bytes fall into the valid ranges.
 456
 457     PARAMETERS
 458         codeUnit - pointer to the first byte of the byte sequence composing
 459             the code unit to test.
 460         length - the number of bytes in the code unit. Valid range is 1 to 4.
 461
 462     RETURNS
 463         TRUE if the given code unit is legal.
 464
 465 *****************************************************************************/
 466 static CsrBool isLegalUtf8(const CsrUtf8String *codeUnit, CsrUint32 length)
 467 {
 468     const CsrUtf8String *srcPtr = codeUnit + length;
 469     CsrUint8 byte;
 470
 471     switch (length) /* Everything falls through except case 1 */
 472     {
 473         case 4:
 474         {
 475             byte = *--srcPtr;
 476             if ((byte < 0x80) || (byte > 0xBF))
 477             {
 478                 return FALSE;
 479             }
 480         }
 481         /* Fallthrough */
 482         case 3:
 483         {
 484             byte = *--srcPtr;
 485             if ((byte < 0x80) || (byte > 0xBF))
 486             {
 487                 return FALSE;
 488             }
 489         }
 490         /* Fallthrough */
 491         case 2:
 492         {
 493             byte = *--srcPtr;
 494             if (byte > 0xBF)
 495             {
 496                 return FALSE;
 497             }
 498
 499             switch (*codeUnit) /* No fallthrough */
 500             {
 501                 case 0xE0:
 502                 {
 503                     if (byte < 0xA0)
 504                     {
 505                         return FALSE;
 506                     }
 507                     break;
 508                 }
 509                 case 0xED:
 510                 {
 511                     if ((byte < 0x80) || (byte > 0x9F))
 512                     {
 513                         return FALSE;
 514                     }
 515                     break;
 516                 }
 517                 case 0xF0:
 518                 {
 519                     if (byte < 0x90)
 520                     {
 521                         return FALSE;
 522                     }
 523                     break;
 524                 }
 525                 case 0xF4:
 526                 {
 527                     if ((byte < 0x80) || (byte > 0x8F))
 528                     {
 529                         return FALSE;
 530                     }
 531                     break;
 532                 }
 533                 default:
 534                 {
 535                     if (byte < 0x80)
 536                     {
 537                         return FALSE;
 538                     }
 539                     break;
 540                 }
 541             }
 542         }
 543         /* Fallthrough */
 544         case 1:
 545         default:
 546             /* The 1st byte and length are assumed correct */
 547             break;
 548     }
 549
 550     return TRUE;
 551 }
 552
 553 /********************************************************************************
 554 *
 555 *   Name:           CsrUtf82Utf16String
 556 *
 557 *   Description:    The function decodes an UTF8 byte oriented string into a
 558 *                   UTF-16string.
 559 *
 560 *   Input:          0-terminated string of byte oriented UTF8 coded characters.
 561 *
 562 *   Output:         0-terminated string of UTF-16 characters.
 563 *
 564 *********************************************************************************/
 565 CsrUtf16String *CsrUtf82Utf16String(const CsrUtf8String *utf8String)
 566 {
 567     CsrSize i, length = 0;
 568     CsrSize sourceLength;
 569     CsrUtf16String *dest = NULL;
 570     CsrUtf16String *destStart = NULL;
 571     CsrInt8 extraBytes2Read;
 572
 573     if (!utf8String)
 574     {
 575         return NULL;
 576     }
 577     sourceLength = CsrStrLen((CsrCharString *) utf8String);
 578
 579     for (i = 0; i < sourceLength; i++)
 580     {
 581         extraBytes2Read = trailingBytesForUtf8[utf8String[i]];
 582
 583         if (extraBytes2Read == -1) /* Illegal byte value, instead put a Unicode 'REPLACEMENT CHARACTER' (U+FFFD) */
 584         {
 585             length += 1;
 586         }
 587         else if (i + extraBytes2Read > sourceLength) /* The extra bytes does not exist, instead put a Unicode 'REPLACEMENT
 588              CHARACTER' (U+FFFD), and the null terminated character */
 589         {
 590             length += 2;
 591             break;
 592         }
 593         else if (isLegalUtf8(&utf8String[i], extraBytes2Read + 1) == FALSE) /* It is not a legal utf-8 character, instead put a Unicode 'REPLACEMENT
 594              CHARACTER' (U+FFFD) */
 595         {
 596             length += 1;
 597         }
 598         else
 599         {
 600             if (utf8String[i] > 0xEF) /* Needs a high and a low surrogate */
 601             {
 602                 length += 2;
 603             }
 604             else
 605             {
 606                 length += 1;
 607             }
 608             i += extraBytes2Read;
 609         }
 610     }
 611
 612     /* Create space for the null terminated character */
 613     dest = (CsrUtf16String *) CsrPmemAlloc((1 + length) * sizeof(CsrUtf16String));
 614     destStart = dest;
 615
 616     for (i = 0; i < sourceLength; i++)
 617     {
 618         extraBytes2Read = trailingBytesForUtf8[utf8String[i]];
 619
 620         if (extraBytes2Read == -1) /* Illegal byte value, instead put a Unicode 'REPLACEMENT CHARACTER' (U+FFFD) */
 621         {
 622             *dest++ = UNI_REPLACEMENT_CHAR;
 623         }
 624         else if (i + extraBytes2Read > sourceLength) /* The extra bytes does not exist, instead put a Unicode 'REPLACEMENT
 625              CHARACTER' (U+FFFD), and the null terminated character */
 626         {
 627             *dest++ = UNI_REPLACEMENT_CHAR;
 628             *dest++ = '\0';
 629             break;
 630         }
 631         else if (isLegalUtf8(&utf8String[i], extraBytes2Read + 1) == FALSE) /* It is not a legal utf-8 character, instead put a Unicode 'REPLACEMENT
 632              CHARACTER' (U+FFFD) */
 633         {
 634             *dest++ = UNI_REPLACEMENT_CHAR;
 635         }
 636         else /* It is legal, convert the character to an CsrUint32 */
 637         {
 638             CsrUint32 ch = 0;
 639
 640             switch (extraBytes2Read) /* Everything falls through */
 641             {
 642                 case 3:
 643                 {
 644                     ch += utf8String[i];
 645                     ch <<= 6;
 646                     i++;
 647                 }
 648                 /* FALLTHROUGH */
 649                 case 2:
 650                 {
 651                     ch += utf8String[i];
 652                     ch <<= 6;
 653                     i++;
 654                 }
 655                 /* FALLTHROUGH */
 656                 case 1:
 657                 {
 658                     ch += utf8String[i];
 659                     ch <<= 6;
 660                     i++;
 661                 }
 662                 /* FALLTHROUGH */
 663                 case 0:
 664                 {
 665                     ch += utf8String[i];
 666                 }
 667                 /* FALLTHROUGH */
 668                 default:
 669                 {
 670                     break;
 671                 }
 672             }
 673
 674             ch -= offsetsFromUtf8[extraBytes2Read];
 675
 676             if (ch <= 0xFFFF) /* Character can be encoded in one CsrUint16 */
 677             {
 678                 *dest++ = (CsrUint16) ch;
 679             }
 680             else /* The character needs two CsrUint16 */
 681             {
 682                 ch -= UNI_HALF_BASE;
 683                 *dest++ = (CsrUint16) ((ch >> UNI_HALF_SHIFT) | UNI_SUR_HIGH_START);
 684                 *dest++ = (CsrUint16) ((ch & 0x03FF) | UNI_SUR_LOW_START);
 685             }
 686         }
 687     }
 688
 689     destStart[length] = 0x00;
 690
 691     return destStart;
 692 }
 693
 694 /********************************************************************************
 695 *
 696 *   Name:           CsrUtf16StrCpy
 697 *
 698 *   Description:    The function copies the contents from one UTF-16 string
 699 *                   to another UTF-16 string.
 700 *
 701 *   Input:          0-terminated UTF-16 string.
 702 *
 703 *   Output:         0-terminated UTF-16 string.
 704 *
 705 *********************************************************************************/
 706 CsrUtf16String *CsrUtf16StrCpy(CsrUtf16String *target, const CsrUtf16String *source)
 707 {
 708     if (source) /* if source is not NULL*/
 709     {
 710         CsrMemCpy(target, source, (CsrUtf16StrLen(source) + 1) * sizeof(CsrUtf16String));
 711         return target;
 712     }
 713     else
 714     {
 715         return NULL;
 716     }
 717 }
 718
 719 /********************************************************************************
 720 *
 721 *   Name:           CsrUtf16StringDuplicate
 722 *
 723 *   Description:    The function allocates a new pointer and copies the input to
 724 *                   the new pointer.
 725 *
 726 *   Input:          0-terminated UTF-16 string.
 727 *
 728 *   Output:         Allocated variable0-terminated UTF-16 string.
 729 *
 730 *********************************************************************************/
 731 CsrUtf16String *CsrUtf16StringDuplicate(const CsrUtf16String *source)
 732 {
 733     CsrUtf16String *target = NULL;
 734     CsrUint32 length;
 735
 736     if (source) /* if source is not NULL*/
 737     {
 738         length = (CsrUtf16StrLen(source) + 1) * sizeof(CsrUtf16String);
 739         target = (CsrUtf16String *) CsrPmemAlloc(length);
 740         CsrMemCpy(target, source, length);
 741     }
 742     return target;
 743 }
 744
 745 /********************************************************************************
 746 *
 747 *   Name:           CsrUtf16StrICmp
 748 *
 749 *   Description:    The function compares two UTF-16 strings.
 750 *
 751 *   Input:          Two 0-terminated UTF-16 string.
 752 *
 753 *   Output:         0: if the strings are identical.
 754 *
 755 *********************************************************************************/
 756 CsrUint16 CsrUtf16StrICmp(const CsrUtf16String *string1, const CsrUtf16String *string2)
 757 {
 758     while (*string1 || *string2)
 759     {
 760         if (CAPITAL(*string1) != CAPITAL(*string2))
 761         {
 762             return *string1 - *string2;
 763         }
 764         string1++;
 765         string2++;
 766     }
 767
 768     return 0;
 769 }
 770
 771 /********************************************************************************
 772 *
 773 *   Name:           CsrUtf16StrNICmp
 774 *
 775 *   Description:    The function compares upto count number of elements in the
 776 *                   two UTF-16 string.
 777 *
 778 *   Input:          Two 0-terminated UTF-16 string and a maximum
 779 *                   number of elements to check.
 780 *
 781 *   Output:         0: if the strings are identical.
 782 *
 783 *********************************************************************************/
 784 CsrUint16 CsrUtf16StrNICmp(const CsrUtf16String *string1, const CsrUtf16String *string2, CsrUint32 count)
 785 {
 786     while ((*string1 || *string2) && count--)
 787     {
 788         if (CAPITAL(*string1) != CAPITAL(*string2))
 789         {
 790             return *string1 - *string2;
 791         }
 792         string1++;
 793         string2++;
 794     }
 795
 796     return 0;
 797 }
 798
 799 /********************************************************************************
 800 *
 801 *   Name:           CsrUtf16String2XML
 802 *
 803 *   Description:    The function converts an unicoded string (UTF-16) into an unicoded XML
 804 *                   string where some special characters are encoded according to
 805 *                   the XML spec.
 806 *
 807 *   Input:          A unicoded string (UTF-16) which is freed.
 808 *
 809 *   Output:         A new unicoded string (UTF-16) containing the converted output.
 810 *
 811 *********************************************************************************/
 812 CsrUtf16String *CsrUtf16String2XML(CsrUtf16String *str)
 813 {
 814     CsrUtf16String *scanString;
 815     CsrUtf16String *outputString = NULL;
 816     CsrUtf16String *resultString = str;
 817     CsrUint32 stringLength = 0;
 818     CsrBool encodeChars = FALSE;
 819
 820     scanString = str;
 821     if (scanString)
 822     {
 823         while (*scanString)
 824         {
 825             if (*scanString == L'&')
 826             {
 827                 stringLength += 5;
 828                 encodeChars = TRUE;
 829             }
 830             else if ((*scanString == L'<') || (*scanString == L'>'))
 831             {
 832                 stringLength += 4;
 833                 encodeChars = TRUE;
 834             }
 835             else
 836             {
 837                 stringLength++;
 838             }
 839
 840             scanString++;
 841         }
 842
 843         stringLength++;
 844
 845         if (encodeChars)
 846         {
 847             resultString = outputString = CsrPmemAlloc(stringLength * sizeof(CsrUtf16String));
 848
 849             scanString = str;
 850
 851             while (*scanString)
 852             {
 853                 if (*scanString == L'&')
 854                 {
 855                     *outputString++ = '&';
 856                     *outputString++ = 'a';
 857                     *outputString++ = 'm';
 858                     *outputString++ = 'p';
 859                     *outputString++ = ';';
 860                 }
 861                 else if (*scanString == L'<')
 862                 {
 863                     *outputString++ = '&';
 864                     *outputString++ = 'l';
 865                     *outputString++ = 't';
 866                     *outputString++ = ';';
 867                 }
 868                 else if (*scanString == L'>')
 869                 {
 870                     *outputString++ = '&';
 871                     *outputString++ = 'g';
 872                     *outputString++ = 't';
 873                     *outputString++ = ';';
 874                 }
 875                 else
 876                 {
 877                     *outputString++ = *scanString;
 878                 }
 879
 880                 scanString++;
 881             }
 882
 883             *outputString++ = 0;
 884
 885             CsrPmemFree(str);
 886         }
 887     }
 888
 889     return resultString;
 890 }
 891
 892 /********************************************************************************
 893 *
 894 *   Name:           CsrXML2Utf16String
 895 *
 896 *   Description:    The function converts an unicoded XML string into an unicoded
 897 *                   string (UTF-16) where some special XML characters are decoded according to
 898 *                   the XML spec.
 899 *
 900 *   Input:          A unicoded XML string which is freed.
 901 *
 902 *   Output:         A new unicoded pointer containing the decoded output.
 903 *
 904 *********************************************************************************/
 905 CsrUtf16String *CsrXML2Utf16String(CsrUtf16String *str)
 906 {
 907     CsrUtf16String *scanString;
 908     CsrUtf16String *outputString = NULL;
 909     CsrUtf16String *resultString = str;
 910     CsrUint32 stringLength = 0;
 911     CsrBool encodeChars = FALSE;
 912
 913     scanString = str;
 914     if (scanString)
 915     {
 916         while (*scanString)
 917         {
 918             if (*scanString == (CsrUtf16String) L'&')
 919             {
 920                 scanString++;
 921
 922                 if (!CsrUtf16StrNICmp(scanString, (CsrUtf16String *) L"AMP;", 4))
 923                 {
 924                     scanString += 3;
 925                     encodeChars = TRUE;
 926                 }
 927                 else if (!CsrUtf16StrNICmp(scanString, (CsrUtf16String *) L"LT;", 3))
 928                 {
 929                     scanString += 2;
 930                     encodeChars = TRUE;
 931                 }
 932                 else if (!CsrUtf16StrNICmp(scanString, (CsrUtf16String *) L"GT;", 3))
 933                 {
 934                     scanString += 2;
 935                     encodeChars = TRUE;
 936                 }
 937                 if (!CsrUtf16StrNICmp(scanString, (CsrUtf16String *) L"APOS;", 5))
 938                 {
 939                     scanString += 4;
 940                     encodeChars = TRUE;
 941                 }
 942                 if (!CsrUtf16StrNICmp(scanString, (CsrUtf16String *) L"QUOT;", 5))
 943                 {
 944                     scanString += 4;
 945                     encodeChars = TRUE;
 946                 }
 947                 else
 948                 {
 949                     scanString--;
 950                 }
 951             }
 952
 953             stringLength++;
 954             scanString++;
 955         }
 956
 957         stringLength++;
 958
 959         if (encodeChars)
 960         {
 961             resultString = outputString = CsrPmemAlloc(stringLength * sizeof(CsrUtf16String));
 962
 963             scanString = str;
 964
 965             while (*scanString)
 966             {
 967                 if (*scanString == L'&')
 968                 {
 969                     scanString++;
 970
 971                     if (!CsrUtf16StrNICmp(scanString, (CsrUtf16String *) L"AMP;", 4))
 972                     {
 973                         *outputString++ = L'&';
 974                         scanString += 3;
 975                     }
 976                     else if (!CsrUtf16StrNICmp(scanString, (CsrUtf16String *) L"LT;", 3))
 977                     {
 978                         *outputString++ = L'<';
 979                         scanString += 2;
 980                     }
 981                     else if (!CsrUtf16StrNICmp(scanString, (CsrUtf16String *) L"GT;", 3))
 982                     {
 983                         *outputString++ = L'>';
 984                         scanString += 2;
 985                     }
 986                     else if (!CsrUtf16StrNICmp(scanString, (CsrUtf16String *) L"APOS;", 5))
 987                     {
 988                         *outputString++ = L'\'';
 989                         scanString += 4;
 990                     }
 991                     else if (!CsrUtf16StrNICmp(scanString, (CsrUtf16String *) L"QUOT;", 5))
 992                     {
 993                         *outputString++ = L'\"';
 994                         scanString += 4;
 995                     }
 996                     else
 997                     {
 998                         *outputString++ = L'&';
 999                         scanString--;
1000                     }
1001                 }
1002                 else
1003                 {
1004                     *outputString++ = *scanString;
1005                 }
1006
1007                 scanString++;
1008             }
1009
1010             *outputString++ = 0;
1011
1012             CsrPmemFree(str);
1013         }
1014     }
1015
1016     return resultString;
1017 }
1018
1019 CsrInt32 CsrUtf8StrCmp(const CsrUtf8String *string1, const CsrUtf8String *string2)
1020 {
1021     return CsrStrCmp((const CsrCharString *) string1, (const CsrCharString *) string2);
1022 }
1023
1024 CsrInt32 CsrUtf8StrNCmp(const CsrUtf8String *string1, const CsrUtf8String *string2, CsrSize count)
1025 {
1026     return CsrStrNCmp((const CsrCharString *) string1, (const CsrCharString *) string2, count);
1027 }
1028
1029 CsrUint32 CsrUtf8StringLengthInBytes(const CsrUtf8String *string)
1030 {
1031     CsrSize length = 0;
1032     if (string)
1033     {
1034         length = CsrStrLen((const CsrCharString *) string);
1035     }
1036     return (CsrUint32) length;
1037 }
1038
1039 CsrUtf8String *CsrUtf8StrCpy(CsrUtf8String *target, const CsrUtf8String *source)
1040 {
1041     return (CsrUtf8String *) CsrStrCpy((CsrCharString *) target, (const CsrCharString *) source);
1042 }
1043
1044 CsrUtf8String *CsrUtf8StrTruncate(CsrUtf8String *target, CsrSize count)
1045 {
1046     CsrSize lastByte = count - 1;
1047
1048     target[count] = '\0';
1049
1050     if (count && (target[lastByte] & 0x80))
1051     {
1052         /* the last byte contains non-ascii char */
1053         if (target[lastByte] & 0x40)
1054         {
1055             /* multi-byte char starting just before truncation */
1056             target[lastByte] = '\0';
1057         }
1058         else if ((target[lastByte - 1] & 0xE0) == 0xE0)
1059         {
1060             /* 3-byte char starting 2 bytes before truncation */
1061             target[lastByte - 1] = '\0';
1062         }
1063         else if ((target[lastByte - 2] & 0xF0) == 0xF0)
1064         {
1065             /* 4-byte char starting 3 bytes before truncation */
1066             target[lastByte - 2] = '\0';
1067         }
1068     }
1069
1070     return target;
1071 }
1072
1073 CsrUtf8String *CsrUtf8StrNCpy(CsrUtf8String *target, const CsrUtf8String *source, CsrSize count)
1074 {
1075     return (CsrUtf8String *) CsrStrNCpy((CsrCharString *) target, (const CsrCharString *) source, count);
1076 }
1077
1078 CsrUtf8String *CsrUtf8StrNCpyZero(CsrUtf8String *target, const CsrUtf8String *source, CsrSize count)
1079 {
1080     CsrStrNCpy((CsrCharString *) target, (const CsrCharString *) source, count);
1081     if (target[count - 1] != '\0')
1082     {
1083         CsrUtf8StrTruncate(target, count - 1);
1084     }
1085     return target;
1086 }
1087
1088 CsrUtf8String *CsrUtf8StrDup(const CsrUtf8String *source)
1089 {
1090     return (CsrUtf8String *) CsrStrDup((const CsrCharString *) source);
1091 }
1092
1093 CsrUtf8String *CsrUtf8StringConcatenateTexts(const CsrUtf8String *inputText1, const CsrUtf8String *inputText2, const CsrUtf8String *inputText3, const CsrUtf8String *inputText4)
1094 {
1095     CsrUtf8String *outputText;
1096     CsrUint32 textLen, textLen1, textLen2, textLen3, textLen4;
1097
1098     textLen1 = CsrUtf8StringLengthInBytes(inputText1);
1099     textLen2 = CsrUtf8StringLengthInBytes(inputText2);
1100     textLen3 = CsrUtf8StringLengthInBytes(inputText3);
1101     textLen4 = CsrUtf8StringLengthInBytes(inputText4);
1102
1103     textLen = textLen1 + textLen2 + textLen3 + textLen4;
1104
1105     if (textLen == 0) /*stop here is all lengths are 0*/
1106     {
1107         return NULL;
1108     }
1109
1110     outputText = (CsrUtf8String *) CsrPmemAlloc((textLen + 1) * sizeof(CsrUtf8String)); /* add space for 0-termination*/
1111
1112
1113     if (inputText1 != NULL)
1114     {
1115         CsrUtf8StrNCpy(outputText, inputText1, textLen1);
1116     }
1117
1118     if (inputText2 != NULL)
1119     {
1120         CsrUtf8StrNCpy(&(outputText[textLen1]), inputText2, textLen2);
1121     }
1122
1123     if (inputText3 != NULL)
1124     {
1125         CsrUtf8StrNCpy(&(outputText[textLen1 + textLen2]), inputText3, textLen3);
1126     }
1127
1128     if (inputText4 != NULL)
1129     {
1130         CsrUtf8StrNCpy(&(outputText[textLen1 + textLen2 + textLen3]), inputText4, textLen4);
1131     }
1132
1133     outputText[textLen] = '\0';
1134
1135     return outputText;
1136 }