Fix UTF-8 decoding routine to comply with Unicode standard
This commit is contained in:
+38
-11
@@ -52,12 +52,22 @@ std::vector<Parser::Action *> Parser::UTF8Parser::input( char c )
|
|||||||
buf[ buf_len++ ] = c;
|
buf[ buf_len++ ] = c;
|
||||||
|
|
||||||
/* This function will only work in a UTF-8 locale. */
|
/* This function will only work in a UTF-8 locale. */
|
||||||
/* This must be asserted by other code. */
|
/* This is asserted in the constructor. */
|
||||||
|
|
||||||
wchar_t pwc;
|
wchar_t pwc;
|
||||||
mbstate_t ps;
|
mbstate_t ps;
|
||||||
memset( &ps, 0, sizeof( ps ) );
|
memset( &ps, 0, sizeof( ps ) );
|
||||||
|
|
||||||
|
size_t total_bytes_parsed = 0;
|
||||||
|
size_t orig_buf_len = buf_len;
|
||||||
|
std::vector<Action *> ret;
|
||||||
|
|
||||||
|
/* this routine is somewhat complicated in order to comply with
|
||||||
|
Unicode 6.0, section 3.9, "Best Practices for using U+FFFD" */
|
||||||
|
|
||||||
|
while ( total_bytes_parsed != orig_buf_len ) {
|
||||||
|
assert( total_bytes_parsed < orig_buf_len );
|
||||||
|
assert( buf_len > 0 );
|
||||||
size_t bytes_parsed = mbrtowc( &pwc, buf, buf_len, &ps );
|
size_t bytes_parsed = mbrtowc( &pwc, buf, buf_len, &ps );
|
||||||
|
|
||||||
/* this returns 0 when n = 0! */
|
/* this returns 0 when n = 0! */
|
||||||
@@ -70,24 +80,41 @@ std::vector<Parser::Action *> Parser::UTF8Parser::input( char c )
|
|||||||
assert( buf_len == 1 );
|
assert( buf_len == 1 );
|
||||||
buf_len = 0;
|
buf_len = 0;
|
||||||
pwc = L'\0';
|
pwc = L'\0';
|
||||||
|
bytes_parsed = 1;
|
||||||
} else if ( bytes_parsed == (size_t) -1 ) {
|
} else if ( bytes_parsed == (size_t) -1 ) {
|
||||||
/* invalid sequence, use replacement character and clear buffer */
|
/* invalid sequence, use replacement character and try again with last char */
|
||||||
assert( errno == EILSEQ );
|
assert( errno == EILSEQ );
|
||||||
|
if ( buf_len > 1 ) {
|
||||||
|
buf[ 0 ] = buf[ buf_len - 1 ];
|
||||||
|
bytes_parsed = buf_len - 1;
|
||||||
|
buf_len = 1;
|
||||||
|
} else {
|
||||||
buf_len = 0;
|
buf_len = 0;
|
||||||
|
bytes_parsed = 1;
|
||||||
|
}
|
||||||
pwc = (wchar_t) 0xFFFD;
|
pwc = (wchar_t) 0xFFFD;
|
||||||
} else if ( bytes_parsed == (size_t) -2 ) {
|
} else if ( bytes_parsed == (size_t) -2 ) {
|
||||||
/* can't parse complete multibyte character */
|
/* incomplete multibyte character: keep the bytes buffered until more input arrives */
|
||||||
/* return empty vector */
|
total_bytes_parsed += buf_len;
|
||||||
std::vector<Action *> vec;
|
continue;
|
||||||
return vec;
|
|
||||||
} else if ( bytes_parsed > 0 ) {
|
} else if ( bytes_parsed > 0 ) {
|
||||||
/* parsed into pwc, accept and clear buffer */
|
/* parsed into pwc, accept */
|
||||||
assert( bytes_parsed == buf_len );
|
assert( bytes_parsed <= buf_len );
|
||||||
buf_len = 0;
|
memcpy( buf, buf + bytes_parsed, buf_len - bytes_parsed );
|
||||||
|
buf_len = buf_len - bytes_parsed;
|
||||||
} else {
|
} else {
|
||||||
throw std::string( "Unknown return value from mbrtowc" );
|
throw std::string( "Unknown return value from mbrtowc" );
|
||||||
}
|
}
|
||||||
|
|
||||||
/* we parsed character into pwc */
|
std::vector<Action *> vec = parser.input( pwc );
|
||||||
return parser.input( pwc );
|
for ( std::vector<Action *>::iterator i = vec.begin();
|
||||||
|
i != vec.end();
|
||||||
|
i++ ) {
|
||||||
|
ret.push_back( *i );
|
||||||
|
}
|
||||||
|
|
||||||
|
total_bytes_parsed += bytes_parsed;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user