@@ -452,21 +452,13 @@ static unsigned parse_hex4(const unsigned char * const input)
452
452
* A literal can be one or two sequences of the form \uXXXX */
453
453
static unsigned char utf16_literal_to_utf8 (const unsigned char * const input_pointer , const unsigned char * const input_end , unsigned char * * output_pointer , const unsigned char * * error_pointer )
454
454
{
455
- /* first bytes of UTF8 encoding for a given length in bytes */
456
- static const unsigned char firstByteMark [5 ] =
457
- {
458
- 0x00 , /* should never happen */
459
- 0x00 , /* 0xxxxxxx */
460
- 0xC0 , /* 110xxxxx */
461
- 0xE0 , /* 1110xxxx */
462
- 0xF0 /* 11110xxx */
463
- };
464
-
465
455
long unsigned int codepoint = 0 ;
466
456
unsigned int first_code = 0 ;
467
457
const unsigned char * first_sequence = input_pointer ;
468
458
unsigned char utf8_length = 0 ;
459
+ unsigned char utf8_position = 0 ;
469
460
unsigned char sequence_length = 0 ;
461
+ unsigned char first_byte_mark = 0 ;
470
462
471
463
/* get the first utf16 sequence */
472
464
first_code = parse_hex4 (first_sequence + 2 );
@@ -537,16 +529,19 @@ static unsigned char utf16_literal_to_utf8(const unsigned char * const input_poi
537
529
{
538
530
/* two bytes, encoding 110xxxxx 10xxxxxx */
539
531
utf8_length = 2 ;
532
+ first_byte_mark = 0xC0 ; /* 11000000 */
540
533
}
541
534
else if (codepoint < 0x10000 )
542
535
{
543
536
/* three bytes, encoding 1110xxxx 10xxxxxx 10xxxxxx */
544
537
utf8_length = 3 ;
538
+ first_byte_mark = 0xE0 ; /* 11100000 */
545
539
}
546
540
else if (codepoint <= 0x10FFFF )
547
541
{
548
542
/* four bytes, encoding 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
549
543
utf8_length = 4 ;
544
+ first_byte_mark = 0xF0 ; /* 11110000 */
550
545
}
551
546
else
552
547
{
@@ -556,28 +551,22 @@ static unsigned char utf16_literal_to_utf8(const unsigned char * const input_poi
556
551
}
557
552
558
553
/* encode as utf8 */
559
- switch (utf8_length )
560
- {
561
- case 4 :
562
- /* 10xxxxxx */
563
- (* output_pointer )[3 ] = (unsigned char )((codepoint | 0x80 ) & 0xBF );
564
- codepoint >>= 6 ;
565
- case 3 :
566
- /* 10xxxxxx */
567
- (* output_pointer )[2 ] = (unsigned char )((codepoint | 0x80 ) & 0xBF );
568
- codepoint >>= 6 ;
569
- case 2 :
570
- (* output_pointer )[1 ] = (unsigned char )((codepoint | 0x80 ) & 0xBF );
571
- codepoint >>= 6 ;
572
- case 1 :
573
- /* depending on the length in bytes this determines the
574
- encoding of the first UTF8 byte */
575
- (* output_pointer )[0 ] = (unsigned char )((codepoint | firstByteMark [utf8_length ]) & 0xFF );
576
- break ;
577
- default :
578
- * error_pointer = first_sequence ;
579
- goto fail ;
554
+ for (utf8_position = (unsigned char )(utf8_length - 1 ); utf8_position > 0 ; utf8_position -- )
555
+ {
556
+ /* 10xxxxxx */
557
+ (* output_pointer )[utf8_position ] = (unsigned char )((codepoint | 0x80 ) & 0xBF );
558
+ codepoint >>= 6 ;
580
559
}
560
+ /* encode first byte */
561
+ if (utf8_length > 1 )
562
+ {
563
+ (* output_pointer )[0 ] = (unsigned char )((codepoint | first_byte_mark ) & 0xFF );
564
+ }
565
+ else
566
+ {
567
+ (* output_pointer )[0 ] = (unsigned char )(codepoint & 0x7F );
568
+ }
569
+
581
570
* output_pointer += utf8_length ;
582
571
583
572
return sequence_length ;
0 commit comments