(trunk libT) update JSON_parser.c: has UTF16 fix

This commit is contained in:
Charles Kerr 2009-07-13 22:52:08 +00:00
parent 031579b5f3
commit f443d6d113
1 changed file with 49 additions and 40 deletions

View File

@@ -32,6 +32,9 @@ SOFTWARE.
For the added features the license above applies also.
Changelog:
2009-05-17
Incorporated benrudiak@googlemail.com fix for UTF16 decoding.
2009-05-14
Fixed float parsing bug related to a locale being set that didn't
use '.' as decimal point character (charles@transmissionbt.com).
@@ -65,7 +68,6 @@ SOFTWARE.
#include <locale.h>
#include "JSON_parser.h"
#include "ConvertUTF.h"
#ifdef _MSC_VER
# if _MSC_VER >= 1400 /* Visual Studio 2005 and up */
@@ -87,12 +89,13 @@ SOFTWARE.
# define JSON_PARSER_PARSE_BUFFER_SIZE 3500
#endif
typedef unsigned short UTF16;
struct JSON_parser_struct {
JSON_parser_callback callback;
void* ctx;
signed char state, before_comment_state, type, escaped, comment, allow_comments, handle_floats_manually;
UTF16 utf16_decode_buffer[2];
UTF16 utf16_high_surrogate;
long depth;
long top;
signed char* stack;
@@ -236,7 +239,7 @@ enum actions
ZX = -19, /* integer detected by zero */
IX = -20, /* integer detected by 1-9 */
EX = -21, /* next char is escaped */
UC = -22, /* Unicode character read */
UC = -22 /* Unicode character read */
};
@@ -517,71 +520,77 @@ static int parse_parse_buffer(JSON_parser jc)
return true;
}
#define IS_HIGH_SURROGATE(uc) (((uc) & 0xFC00) == 0xD800)
#define IS_LOW_SURROGATE(uc) (((uc) & 0xFC00) == 0xDC00)
#define DECODE_SURROGATE_PAIR(hi,lo) ((((hi) & 0x3FF) << 10) + ((lo) & 0x3FF) + 0x10000)
static unsigned char utf8_lead_bits[4] = { 0x00, 0xC0, 0xE0, 0xF0 };
static int decode_unicode_char(JSON_parser jc)
{
const unsigned chars = jc->utf16_decode_buffer[0] ? 2 : 1;
int i;
UTF16 *uc = chars == 1 ? &jc->utf16_decode_buffer[0] : &jc->utf16_decode_buffer[1];
UTF16 x;
unsigned uc = 0;
char* p;
int trail_bytes;
assert(jc->parse_buffer_count >= 6);
p = &jc->parse_buffer[jc->parse_buffer_count - 4];
for (i = 0; i < 4; ++i, ++p) {
x = *p;
for (i = 12; i >= 0; i -= 4, ++p) {
unsigned x = *p;
if (x >= 'a') {
x -= ('a' - 10);
} else if (x >= 'A') {
x -= ('A' - 10);
} else {
x &= ~((UTF16) 0x30);
x &= ~0x30u;
}
assert(x < 16);
*uc |= x << ((3u - i) << 2);
uc |= x << i;
}
/* clear UTF-16 char form buffer */
/* clear UTF-16 char from buffer */
jc->parse_buffer_count -= 6;
jc->parse_buffer[jc->parse_buffer_count] = 0;
/* attempt decoding ... */
{
UTF8* dec_start = (UTF8*)&jc->parse_buffer[jc->parse_buffer_count];
UTF8* dec_start_dup = dec_start;
UTF8* dec_end = dec_start + 6;
const UTF16* enc_start = &jc->utf16_decode_buffer[0];
const UTF16* enc_end = enc_start + chars;
const ConversionResult result = ConvertUTF16toUTF8(
&enc_start, enc_end, &dec_start, dec_end, strictConversion);
const size_t new_chars = dec_start - dec_start_dup;
/* was it a surrogate UTF-16 char? */
if (chars == 1 && result == sourceExhausted) {
return true;
}
if (result != conversionOK) {
if (jc->utf16_high_surrogate) {
if (IS_LOW_SURROGATE(uc)) {
uc = DECODE_SURROGATE_PAIR(jc->utf16_high_surrogate, uc);
trail_bytes = 3;
jc->utf16_high_surrogate = 0;
} else {
/* high surrogate without a following low surrogate */
return false;
}
/* NOTE: clear decode buffer to resume string reading,
otherwise we continue to read UTF-16 */
jc->utf16_decode_buffer[0] = 0;
assert(new_chars <= 6);
jc->parse_buffer_count += new_chars;
jc->parse_buffer[jc->parse_buffer_count] = 0;
} else {
if (uc < 0x80) {
trail_bytes = 0;
} else if (uc < 0x800) {
trail_bytes = 1;
} else if (IS_HIGH_SURROGATE(uc)) {
/* save the high surrogate and wait for the low surrogate */
jc->utf16_high_surrogate = uc;
return true;
} else if (IS_LOW_SURROGATE(uc)) {
/* low surrogate without a preceding high surrogate */
return false;
} else {
trail_bytes = 2;
}
}
jc->parse_buffer[jc->parse_buffer_count++] = (char) ((uc >> (trail_bytes * 6)) | utf8_lead_bits[trail_bytes]);
for (i = trail_bytes * 6 - 6; i >= 0; i -= 6) {
jc->parse_buffer[jc->parse_buffer_count++] = (char) (((uc >> i) & 0x3F) | 0x80);
}
jc->parse_buffer[jc->parse_buffer_count] = 0;
return true;
}
@@ -694,7 +703,7 @@ JSON_parser_char(JSON_parser jc, int next_char)
return false;
}
/* check if we need to read a second UTF-16 char */
if (jc->utf16_decode_buffer[0]) {
if (jc->utf16_high_surrogate) {
jc->state = D1;
} else {
jc->state = ST;