Rework utf-8 string concat function a little and make it more widely available.
This commit is contained in:
parent
4b97e604b1
commit
7301b39c8b
|
@ -56,7 +56,6 @@ static int getannounce( tr_info_t * inf, benc_val_t * meta );
|
|||
static char * announceToScrape( const char * announce );
|
||||
static int parseFiles( tr_info_t * inf, benc_val_t * name,
|
||||
benc_val_t * files, benc_val_t * length );
|
||||
static void strcatUTF8( char *, int, const char *, int );
|
||||
|
||||
/***********************************************************************
|
||||
* tr_metainfoParse
|
||||
|
@ -205,14 +204,14 @@ realparse( tr_info_t * inf, const uint8_t * buf, size_t size )
|
|||
val = tr_bencDictFindFirst( &meta, "comment.utf-8", "comment", NULL );
|
||||
if( NULL != val && TYPE_STR == val->type )
|
||||
{
|
||||
strcatUTF8( inf->comment, sizeof( inf->comment ), val->val.s.s, 0 );
|
||||
strlcat_utf8( inf->comment, val->val.s.s, sizeof( inf->comment ), 0 );
|
||||
}
|
||||
|
||||
/* Creator info */
|
||||
val = tr_bencDictFindFirst( &meta, "created by.utf-8", "created by", NULL );
|
||||
if( NULL != val && TYPE_STR == val->type )
|
||||
{
|
||||
strcatUTF8( inf->creator, sizeof( inf->creator ), val->val.s.s, 0 );
|
||||
strlcat_utf8( inf->creator, val->val.s.s, sizeof( inf->creator ), 0 );
|
||||
}
|
||||
|
||||
/* Date created */
|
||||
|
@ -370,11 +369,11 @@ static int getfile( char * buf, int size,
|
|||
return TR_EINVALID;
|
||||
}
|
||||
|
||||
strcatUTF8( buf, size, prefix, 0 );
|
||||
strlcat_utf8( buf, prefix, size, 0 );
|
||||
for( ii = 0; jj > ii; ii++ )
|
||||
{
|
||||
strcatUTF8( buf, size, TR_PATH_DELIMITER_STR, 0 );
|
||||
strcatUTF8( buf, size, list[ii], 1 );
|
||||
strlcat_utf8( buf, TR_PATH_DELIMITER_STR, size, 0 );
|
||||
strlcat_utf8( buf, list[ii], size, TR_PATH_DELIMITER );
|
||||
}
|
||||
free( list );
|
||||
|
||||
|
@ -690,7 +689,8 @@ parseFiles( tr_info_t * inf, benc_val_t * name,
|
|||
return TR_EINVALID;
|
||||
}
|
||||
|
||||
strcatUTF8( inf->name, sizeof( inf->name ), name->val.s.s, 1 );
|
||||
strlcat_utf8( inf->name, name->val.s.s, sizeof( inf->name ),
|
||||
TR_PATH_DELIMITER );
|
||||
if( '\0' == inf->name[0] )
|
||||
{
|
||||
tr_err( "Invalid \"name\" string" );
|
||||
|
@ -744,8 +744,8 @@ parseFiles( tr_info_t * inf, benc_val_t * name,
|
|||
return TR_EINVALID;
|
||||
}
|
||||
|
||||
strcatUTF8( inf->files[0].name, sizeof( inf->files[0].name ),
|
||||
name->val.s.s, 1 );
|
||||
strlcat_utf8( inf->files[0].name, name->val.s.s,
|
||||
sizeof( inf->files[0].name ), TR_PATH_DELIMITER );
|
||||
|
||||
inf->files[0].length = length->val.i;
|
||||
inf->totalSize += length->val.i;
|
||||
|
@ -759,80 +759,3 @@ parseFiles( tr_info_t * inf, benc_val_t * name,
|
|||
|
||||
return TR_OK;
|
||||
}
|
||||
|
||||
/***********************************************************************
|
||||
* strcatUTF8
|
||||
***********************************************************************
|
||||
* According to the official specification, all strings in the torrent
|
||||
* file are supposed to be UTF-8 encoded. However, there are
|
||||
* non-compliant torrents around... If we encounter an invalid UTF-8
|
||||
* character, we assume it is ISO 8859-1 and convert it to UTF-8.
|
||||
**********************************************************************/
|
||||
#define WANTBYTES( want, got ) \
|
||||
if( (want) > (got) ) { return; } else { (got) -= (want); }
|
||||
static void strcatUTF8( char * s, int len, const char * append, int deslash )
|
||||
{
|
||||
const char * p;
|
||||
|
||||
/* don't overwrite the nul at the end */
|
||||
len--;
|
||||
|
||||
/* Go to the end of the destination string */
|
||||
while( s[0] )
|
||||
{
|
||||
s++;
|
||||
len--;
|
||||
}
|
||||
|
||||
/* Now start appending, converting on the fly if necessary */
|
||||
for( p = append; p[0]; )
|
||||
{
|
||||
/* skip over / if requested */
|
||||
if( deslash && '/' == p[0] )
|
||||
{
|
||||
p++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if( !( p[0] & 0x80 ) )
|
||||
{
|
||||
/* ASCII character */
|
||||
WANTBYTES( 1, len );
|
||||
*(s++) = *(p++);
|
||||
continue;
|
||||
}
|
||||
|
||||
if( ( p[0] & 0xE0 ) == 0xC0 && ( p[1] & 0xC0 ) == 0x80 )
|
||||
{
|
||||
/* 2-bytes UTF-8 character */
|
||||
WANTBYTES( 2, len );
|
||||
*(s++) = *(p++); *(s++) = *(p++);
|
||||
continue;
|
||||
}
|
||||
|
||||
if( ( p[0] & 0xF0 ) == 0xE0 && ( p[1] & 0xC0 ) == 0x80 &&
|
||||
( p[2] & 0xC0 ) == 0x80 )
|
||||
{
|
||||
/* 3-bytes UTF-8 character */
|
||||
WANTBYTES( 3, len );
|
||||
*(s++) = *(p++); *(s++) = *(p++);
|
||||
*(s++) = *(p++);
|
||||
continue;
|
||||
}
|
||||
|
||||
if( ( p[0] & 0xF8 ) == 0xF0 && ( p[1] & 0xC0 ) == 0x80 &&
|
||||
( p[2] & 0xC0 ) == 0x80 && ( p[3] & 0xC0 ) == 0x80 )
|
||||
{
|
||||
/* 4-bytes UTF-8 character */
|
||||
WANTBYTES( 4, len );
|
||||
*(s++) = *(p++); *(s++) = *(p++);
|
||||
*(s++) = *(p++); *(s++) = *(p++);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* ISO 8859-1 -> UTF-8 conversion */
|
||||
WANTBYTES( 2, len );
|
||||
*(s++) = 0xC0 | ( ( *p & 0xFF ) >> 6 );
|
||||
*(s++) = 0x80 | ( *(p++) & 0x3F );
|
||||
}
|
||||
}
|
||||
|
|
|
@ -729,3 +729,115 @@ tr_wait( uint64_t delay_milliseconds )
|
|||
usleep( 1000 * delay_milliseconds );
|
||||
#endif
|
||||
}
|
||||
|
||||
#define WANTBYTES( want, got ) \
|
||||
if( (want) > (got) ) { return; } else { (got) -= (want); }
|
||||
void
|
||||
strlcat_utf8( void * dest, const void * src, size_t len, char skip )
|
||||
{
|
||||
char * s = dest;
|
||||
const char * append = src;
|
||||
const char * p;
|
||||
|
||||
/* don't overwrite the nul at the end */
|
||||
len--;
|
||||
|
||||
/* Go to the end of the destination string */
|
||||
while( s[0] )
|
||||
{
|
||||
s++;
|
||||
len--;
|
||||
}
|
||||
|
||||
/* Now start appending, converting on the fly if necessary */
|
||||
for( p = append; p[0]; )
|
||||
{
|
||||
/* skip over the requested character */
|
||||
if( skip == p[0] )
|
||||
{
|
||||
p++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if( !( p[0] & 0x80 ) )
|
||||
{
|
||||
/* ASCII character */
|
||||
WANTBYTES( 1, len );
|
||||
*(s++) = *(p++);
|
||||
continue;
|
||||
}
|
||||
|
||||
if( ( p[0] & 0xE0 ) == 0xC0 && ( p[1] & 0xC0 ) == 0x80 )
|
||||
{
|
||||
/* 2-bytes UTF-8 character */
|
||||
WANTBYTES( 2, len );
|
||||
*(s++) = *(p++); *(s++) = *(p++);
|
||||
continue;
|
||||
}
|
||||
|
||||
if( ( p[0] & 0xF0 ) == 0xE0 && ( p[1] & 0xC0 ) == 0x80 &&
|
||||
( p[2] & 0xC0 ) == 0x80 )
|
||||
{
|
||||
/* 3-bytes UTF-8 character */
|
||||
WANTBYTES( 3, len );
|
||||
*(s++) = *(p++); *(s++) = *(p++);
|
||||
*(s++) = *(p++);
|
||||
continue;
|
||||
}
|
||||
|
||||
if( ( p[0] & 0xF8 ) == 0xF0 && ( p[1] & 0xC0 ) == 0x80 &&
|
||||
( p[2] & 0xC0 ) == 0x80 && ( p[3] & 0xC0 ) == 0x80 )
|
||||
{
|
||||
/* 4-bytes UTF-8 character */
|
||||
WANTBYTES( 4, len );
|
||||
*(s++) = *(p++); *(s++) = *(p++);
|
||||
*(s++) = *(p++); *(s++) = *(p++);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* ISO 8859-1 -> UTF-8 conversion */
|
||||
WANTBYTES( 2, len );
|
||||
*(s++) = 0xC0 | ( ( *p & 0xFF ) >> 6 );
|
||||
*(s++) = 0x80 | ( *(p++) & 0x3F );
|
||||
}
|
||||
}
|
||||
|
||||
size_t
|
||||
bufsize_utf8( const void * vstr, int * changed )
|
||||
{
|
||||
const char * str = vstr;
|
||||
size_t ii, grow;
|
||||
|
||||
if( NULL != changed )
|
||||
*changed = 0;
|
||||
|
||||
ii = 0;
|
||||
grow = 1;
|
||||
while( '\0' != str[ii] )
|
||||
{
|
||||
if( !( str[ii] & 0x80 ) )
|
||||
/* ASCII character */
|
||||
ii++;
|
||||
else if( ( str[ii] & 0xE0 ) == 0xC0 && ( str[ii+1] & 0xC0 ) == 0x80 )
|
||||
/* 2-bytes UTF-8 character */
|
||||
ii += 2;
|
||||
else if( ( str[ii] & 0xF0 ) == 0xE0 && ( str[ii+1] & 0xC0 ) == 0x80 &&
|
||||
( str[ii+2] & 0xC0 ) == 0x80 )
|
||||
/* 3-bytes UTF-8 character */
|
||||
ii += 3;
|
||||
else if( ( str[ii] & 0xF8 ) == 0xF0 && ( str[ii+1] & 0xC0 ) == 0x80 &&
|
||||
( str[ii+2] & 0xC0 ) == 0x80 && ( str[ii+3] & 0xC0 ) == 0x80 )
|
||||
/* 4-bytes UTF-8 character */
|
||||
ii += 4;
|
||||
else
|
||||
{
|
||||
/* ISO 8859-1 -> UTF-8 conversion */
|
||||
ii++;
|
||||
grow++;
|
||||
if( NULL != changed )
|
||||
*changed = 1;
|
||||
}
|
||||
}
|
||||
|
||||
return ii + grow;
|
||||
}
|
||||
|
|
|
@ -88,6 +88,17 @@ uint64_t tr_date( void );
|
|||
/* wait the specified number of milliseconds */
|
||||
void tr_wait( uint64_t delay_milliseconds );
|
||||
|
||||
/***********************************************************************
|
||||
* strlcat_utf8
|
||||
***********************************************************************
|
||||
* According to the official specification, all strings in the torrent
|
||||
* file are supposed to be UTF-8 encoded. However, there are
|
||||
* non-compliant torrents around... If we encounter an invalid UTF-8
|
||||
* character, we assume it is ISO 8859-1 and convert it to UTF-8.
|
||||
**********************************************************************/
|
||||
void strlcat_utf8( void *, const void *, size_t, char );
|
||||
size_t bufsize_utf8( const void *, int * );
|
||||
|
||||
/***
|
||||
****
|
||||
***/
|
||||
|
|
Loading…
Reference in New Issue