mirror of
https://github.com/transmission/transmission
synced 2025-01-30 10:52:00 +00:00
refactor: use cpputf for utf8 validation and conversion (#2251)
* refactor: use cpputf for utf8 validation and conversion
This commit is contained in:
parent
9516e8a923
commit
bbe49639d6
12 changed files with 82 additions and 835 deletions
4
.gitmodules
vendored
4
.gitmodules
vendored
|
@ -28,3 +28,7 @@
|
|||
[submodule "third-party/googletest"]
|
||||
path = third-party/googletest
|
||||
url = https://github.com/google/googletest.git
|
||||
[submodule "third-party/utfcpp"]
|
||||
branch = post-3.2.1-transmission
|
||||
path = third-party/utfcpp
|
||||
url = https://github.com/transmission/utfcpp
|
||||
|
|
|
@ -142,6 +142,7 @@ if(WIN32)
|
|||
endforeach()
|
||||
endif()
|
||||
|
||||
find_package(UtfCpp)
|
||||
find_package(Threads)
|
||||
find_package(PkgConfig QUIET)
|
||||
|
||||
|
|
|
@ -206,8 +206,6 @@
|
|||
A2A1CB7A0BF29D5500AE959F /* PeerProgressIndicatorCell.mm in Sources */ = {isa = PBXBuildFile; fileRef = A2A1CB780BF29D5500AE959F /* PeerProgressIndicatorCell.mm */; };
|
||||
A2A4E9210DE0F7E9000CE197 /* web.h in Headers */ = {isa = PBXBuildFile; fileRef = A29EBE530DC01FC9006CEE80 /* web.h */; };
|
||||
A2A4E9220DE0F7EB000CE197 /* web.cc in Sources */ = {isa = PBXBuildFile; fileRef = A29EBE520DC01FC9006CEE80 /* web.cc */; };
|
||||
A2A4EA0E0DE106EB000CE197 /* ConvertUTF.c in Sources */ = {isa = PBXBuildFile; fileRef = A2A4EA0A0DE106E8000CE197 /* ConvertUTF.c */; };
|
||||
A2A4EA0F0DE106EE000CE197 /* ConvertUTF.h in Headers */ = {isa = PBXBuildFile; fileRef = A2A4EA0B0DE106E8000CE197 /* ConvertUTF.h */; };
|
||||
A2A6321B0CD9751700E3DA60 /* BadgeView.mm in Sources */ = {isa = PBXBuildFile; fileRef = A2A6321A0CD9751700E3DA60 /* BadgeView.mm */; };
|
||||
A2A7B32A164F87D400B98C65 /* jsonsl.c in Sources */ = {isa = PBXBuildFile; fileRef = A2A7B328164F87D400B98C65 /* jsonsl.c */; };
|
||||
A2A7B32B164F87D400B98C65 /* jsonsl.h in Headers */ = {isa = PBXBuildFile; fileRef = A2A7B329164F87D400B98C65 /* jsonsl.h */; };
|
||||
|
@ -830,8 +828,6 @@
|
|||
A2A1C81D142EC032008C17BF /* nl */ = {isa = PBXFileReference; lastKnownFileType = file.xib; name = nl; path = nl.lproj/GlobalOptionsPopover.xib; sourceTree = "<group>"; };
|
||||
A2A1CB770BF29D5500AE959F /* PeerProgressIndicatorCell.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = PeerProgressIndicatorCell.h; sourceTree = "<group>"; };
|
||||
A2A1CB780BF29D5500AE959F /* PeerProgressIndicatorCell.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = PeerProgressIndicatorCell.mm; sourceTree = "<group>"; };
|
||||
A2A4EA0A0DE106E8000CE197 /* ConvertUTF.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = ConvertUTF.c; sourceTree = "<group>"; };
|
||||
A2A4EA0B0DE106E8000CE197 /* ConvertUTF.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ConvertUTF.h; sourceTree = "<group>"; };
|
||||
A2A632190CD9751700E3DA60 /* BadgeView.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = BadgeView.h; sourceTree = "<group>"; };
|
||||
A2A6321A0CD9751700E3DA60 /* BadgeView.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = BadgeView.mm; sourceTree = "<group>"; };
|
||||
A2A7B328164F87D400B98C65 /* jsonsl.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; path = jsonsl.c; sourceTree = "<group>"; };
|
||||
|
@ -1511,8 +1507,6 @@
|
|||
A2EA52301686AC0D00180493 /* quark.h */,
|
||||
A2AF23C616B44FA0003BC59E /* log.cc */,
|
||||
A2AF23C716B44FA0003BC59E /* log.h */,
|
||||
A2A4EA0B0DE106E8000CE197 /* ConvertUTF.h */,
|
||||
A2A4EA0A0DE106E8000CE197 /* ConvertUTF.c */,
|
||||
4DB74F070E8CD75100AEB1A8 /* wildmat.c */,
|
||||
C1FEE5751C3223CC00D62832 /* watchdir.cc */,
|
||||
C1FEE5761C3223CC00D62832 /* watchdir.h */,
|
||||
|
@ -1884,7 +1878,6 @@
|
|||
C1FEE57B1C3223CC00D62832 /* watchdir.h in Headers */,
|
||||
A2AAB6650DE0D08B00E04DDA /* blocklist.h in Headers */,
|
||||
A2A4E9210DE0F7E9000CE197 /* web.h in Headers */,
|
||||
A2A4EA0F0DE106EE000CE197 /* ConvertUTF.h in Headers */,
|
||||
A25E03E20E4015380086C225 /* tr-getopt.h in Headers */,
|
||||
A21FBBAB0EDA78C300BC3C51 /* bandwidth.h in Headers */,
|
||||
A22CFCA90FC24ED80009BD3E /* tr-dht.h in Headers */,
|
||||
|
@ -2477,7 +2470,6 @@
|
|||
4D4ADFC70DA1631500A68297 /* blocklist.cc in Sources */,
|
||||
A29DF8B90DB2544C00D04E5A /* resume.cc in Sources */,
|
||||
A2A4E9220DE0F7EB000CE197 /* web.cc in Sources */,
|
||||
A2A4EA0E0DE106EB000CE197 /* ConvertUTF.c in Sources */,
|
||||
A292A6E80DFB45FC004B9C0A /* webseed.cc in Sources */,
|
||||
A25E03E30E4015380086C225 /* tr-getopt.cc in Sources */,
|
||||
C1305EBE186A13B100F03351 /* file.cc in Sources */,
|
||||
|
@ -3034,6 +3026,7 @@
|
|||
"third-party/libb64/include",
|
||||
"third-party/libevent/include",
|
||||
"third-party/libutp",
|
||||
"third-party/utfcpp/source",
|
||||
);
|
||||
OTHER_CFLAGS = (
|
||||
"$(inherited)",
|
||||
|
@ -3227,6 +3220,7 @@
|
|||
"third-party/libb64/include",
|
||||
"third-party/libevent/include",
|
||||
"third-party/libutp",
|
||||
"third-party/utfcpp/source",
|
||||
);
|
||||
OTHER_CFLAGS = (
|
||||
"$(inherited)",
|
||||
|
@ -3477,6 +3471,7 @@
|
|||
"third-party/libb64/include",
|
||||
"third-party/libevent/include",
|
||||
"third-party/libutp",
|
||||
"third-party/utfcpp/source",
|
||||
);
|
||||
OTHER_CFLAGS = (
|
||||
"$(inherited)",
|
||||
|
|
1
cmake/FindUtfCpp.cmake
Normal file
1
cmake/FindUtfCpp.cmake
Normal file
|
@ -0,0 +1 @@
|
|||
# Locate the headers of the bundled utfcpp submodule (third-party/utfcpp).
#
# The directory is resolved relative to this module file (cmake/FindUtfCpp.cmake)
# instead of CMAKE_SOURCE_DIR, so the lookup keeps working when this project is
# pulled into a larger build with add_subdirectory(); in a top-level build both
# spellings name the same directory.
#
# Result variable:
#   UTFCPP_INCLUDE_DIRS - absolute path of the utfcpp "source" header directory
get_filename_component(UTFCPP_INCLUDE_DIRS
    "${CMAKE_CURRENT_LIST_DIR}/../third-party/utfcpp/source"
    ABSOLUTE)
|
|
@ -87,7 +87,6 @@ foreach(FILE ${PROJECT_FILES})
|
|||
endforeach()
|
||||
|
||||
set(THIRD_PARTY_FILES
|
||||
ConvertUTF.c
|
||||
jsonsl.c
|
||||
wildmat.c
|
||||
)
|
||||
|
@ -144,7 +143,6 @@ set(${PROJECT_NAME}_PUBLIC_HEADERS
|
|||
)
|
||||
|
||||
set(${PROJECT_NAME}_PRIVATE_HEADERS
|
||||
ConvertUTF.h
|
||||
announcer-common.h
|
||||
announcer.h
|
||||
bandwidth.h
|
||||
|
@ -244,6 +242,7 @@ include_directories(
|
|||
|
||||
include_directories(
|
||||
SYSTEM
|
||||
${UTFCPP_INCLUDE_DIRS}
|
||||
${ZLIB_INCLUDE_DIRS}
|
||||
${CRYPTO_INCLUDE_DIRS}
|
||||
${CURL_INCLUDE_DIRS}
|
||||
|
|
|
@ -1,602 +0,0 @@
|
|||
/*
|
||||
* Copyright 2001-2004 Unicode, Inc.
|
||||
*
|
||||
* Disclaimer
|
||||
*
|
||||
* This source code is provided as is by Unicode, Inc. No claims are
|
||||
* made as to fitness for any particular purpose. No warranties of any
|
||||
* kind are expressed or implied. The recipient agrees to determine
|
||||
* applicability of information provided. If this file has been
|
||||
* purchased on magnetic or optical media from Unicode, Inc., the
|
||||
* sole remedy for any claim will be exchange of defective media
|
||||
* within 90 days of receipt.
|
||||
*
|
||||
* Limitations on Rights to Redistribute This Code
|
||||
*
|
||||
* Unicode, Inc. hereby grants the right to freely use the information
|
||||
* supplied in this file in the creation of products supporting the
|
||||
* Unicode Standard, and to make copies of this file in any form
|
||||
* for internal or external distribution as long as this notice
|
||||
* remains attached.
|
||||
*/
|
||||
|
||||
/* ---------------------------------------------------------------------
|
||||
|
||||
Conversions between UTF32, UTF-16, and UTF-8. Source code file.
|
||||
Author: Mark E. Davis, 1994.
|
||||
Rev History: Rick McGowan, fixes & updates May 2001.
|
||||
Sept 2001: fixed const & error conditions per
|
||||
mods suggested by S. Parent & A. Lillich.
|
||||
June 2002: Tim Dodd added detection and handling of incomplete
|
||||
source sequences, enhanced error detection, added casts
|
||||
to eliminate compiler warnings.
|
||||
July 2003: slight mods to back out aggressive FFFE detection.
|
||||
Jan 2004: updated switches in from-UTF8 conversions.
|
||||
Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
|
||||
May 2006: updated isLegalUTF8Sequence.
|
||||
|
||||
See the header file "ConvertUTF.h" for complete documentation.
|
||||
|
||||
------------------------------------------------------------------------ */
|
||||
|
||||
#ifdef CVTUTF_DEBUG
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
#include <string.h> /* strlen () */
|
||||
|
||||
#include "ConvertUTF.h"
|
||||
|
||||
static const int halfShift = 10; /* used for shifting by 10 bits */
|
||||
|
||||
static const UTF32 halfBase = 0x0010000UL;
|
||||
static const UTF32 halfMask = 0x3FFUL;
|
||||
|
||||
#define UNI_SUR_HIGH_START (UTF32)0xD800
|
||||
#define UNI_SUR_HIGH_END (UTF32)0xDBFF
|
||||
#define UNI_SUR_LOW_START (UTF32)0xDC00
|
||||
#define UNI_SUR_LOW_END (UTF32)0xDFFF
|
||||
#define false 0
|
||||
#define true 1
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
/*
 * Converts the UTF-32 text in [*sourceStart, sourceEnd) to UTF-16, storing the
 * output in [*targetStart, targetEnd).
 *
 * On return *sourceStart and *targetStart are advanced past the input that was
 * consumed and the output that was written.
 *
 * Returns:
 *   conversionOK    - all of the input was converted.
 *   targetExhausted - the output buffer filled up; *sourceStart is left on the
 *                     first value that was not converted.
 *   sourceIllegal   - strictConversion only: a surrogate value or a value above
 *                     UNI_MAX_LEGAL_UTF32 was met; *sourceStart is left on it.
 *
 * With any flag other than strictConversion such values are replaced by
 * UNI_REPLACEMENT_CHAR and the conversion carries on.
 */
ConversionResult ConvertUTF32toUTF16 (
    const UTF32** sourceStart, const UTF32* sourceEnd,
    UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF32* source = *sourceStart;
    UTF16* target = *targetStart;
    while (source < sourceEnd) {
        UTF32 ch;
        if (target >= targetEnd) {
            result = targetExhausted;
            break;
        }
        ch = *source++;
        if (ch <= UNI_MAX_BMP) { /* fits in a single UTF-16 unit */
            /* UTF-16 surrogate values are illegal in UTF-32 */
            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
                if (flags == strictConversion) {
                    --source; /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                } else {
                    *target++ = UNI_REPLACEMENT_CHAR;
                }
            } else {
                *target++ = (UTF16)ch; /* normal case */
            }
        } else if (ch > UNI_MAX_LEGAL_UTF32) {
            if (flags == strictConversion) {
                /*
                 * Stop at the offending value, exactly like the surrogate case
                 * above. Without the break the loop would keep converting, a
                 * later targetExhausted could overwrite the error, and
                 * *sourceStart would no longer identify the illegal value.
                 */
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            } else {
                *target++ = UNI_REPLACEMENT_CHAR;
            }
        } else {
            /* ch is in the range 0x10000 - 0x10FFFF: needs a surrogate pair */
            if (target + 1 >= targetEnd) {
                --source; /* Back up source pointer! */
                result = targetExhausted;
                break;
            }
            ch -= halfBase;
            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
        }
    }
    *sourceStart = source;
    *targetStart = target;
    return result;
}
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
/*
 * Converts the UTF-16 text in [*sourceStart, sourceEnd) to UTF-32, storing the
 * output in [*targetStart, targetEnd).
 *
 * A high surrogate followed by a low surrogate is combined into one value.
 * With strictConversion an unpaired surrogate stops the conversion with
 * sourceIllegal and *sourceStart left on the offending unit; with any other
 * flag the unpaired unit is copied through unchanged.
 *
 * Returns conversionOK, sourceExhausted (input ends right after a high
 * surrogate), targetExhausted (no room for the next value; *sourceStart is
 * left on the unit that could not be converted) or sourceIllegal.
 */
ConversionResult ConvertUTF16toUTF32 (
    const UTF16** sourceStart, const UTF16* sourceEnd,
    UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    ConversionResult result = conversionOK;
    const UTF16* source = *sourceStart;
    UTF32* target = *targetStart;
    /*
     * Zero-initialised so that the CVTUTF_DEBUG report at the end never reads
     * an indeterminate value (ch2 is only assigned after a high surrogate).
     */
    UTF32 ch = 0;
    UTF32 ch2 = 0;
    while (source < sourceEnd) {
        const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
        ch = *source++;
        /* If we have a surrogate pair, convert to UTF32 first. */
        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
            /* If the 16 bits following the high surrogate are in the source buffer... */
            if (source < sourceEnd) {
                ch2 = *source;
                /* If it's a low surrogate, convert to UTF32. */
                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
                    ++source;
                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
                    --source; /* return to the illegal value itself */
                    result = sourceIllegal;
                    break;
                }
            } else { /* We don't have the 16 bits following the high surrogate. */
                --source; /* return to the high surrogate */
                result = sourceExhausted;
                break;
            }
        } else if (flags == strictConversion) {
            /* a low surrogate with no preceding high surrogate is illegal */
            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
                --source; /* return to the illegal value itself */
                result = sourceIllegal;
                break;
            }
        }
        if (target >= targetEnd) {
            source = oldSource; /* Back up source pointer! */
            result = targetExhausted;
            break;
        }
        *target++ = ch;
    }
    *sourceStart = source;
    *targetStart = target;
#ifdef CVTUTF_DEBUG
    if (result == sourceIllegal) {
        /* UTF32 is an unsigned long, so print it with the matching length modifier. */
        fprintf (stderr, "ConvertUTF16toUTF32 illegal seq 0x%04lx,%04lx\n",
            (unsigned long)ch, (unsigned long)ch2);
        fflush (stderr);
    }
#endif
    return result;
}
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* Index into the table below with the first byte of a UTF-8 sequence to
|
||||
* get the number of trailing bytes that are supposed to follow it.
|
||||
* Note that *legal* UTF-8 values can't have 4 or 5 trailing bytes. The table is
|
||||
* left as-is for anyone who may want to do such conversion, which was
|
||||
* allowed in earlier algorithms.
|
||||
*/
|
||||
static const char trailingBytesForUTF8[256] = {
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
|
||||
};
|
||||
|
||||
/*
|
||||
* Magic values subtracted from a buffer value during UTF8 conversion.
|
||||
* This table contains as many values as there might be trailing bytes
|
||||
* in a UTF-8 sequence.
|
||||
*/
|
||||
static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
|
||||
0x03C82080UL, 0xFA082080UL, 0x82082080UL };
|
||||
|
||||
/*
|
||||
* Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
|
||||
* into the first byte, depending on how many bytes follow. There are
|
||||
* as many entries in this table as there are UTF-8 sequence types.
|
||||
* (I.e., one byte sequence, two byte... etc.). Remember that sequences
|
||||
* for *legal* UTF-8 will be 4 or fewer bytes total.
|
||||
*/
|
||||
static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
/* The interface converts a whole buffer to avoid function-call overhead.
|
||||
* Constants have been gathered. Loops & conditionals have been removed as
|
||||
* much as possible for efficiency, in favor of drop-through switches.
|
||||
* (See "Note A" at the bottom of the file for equivalent code.)
|
||||
* If your compiler supports it, the "isLegalUTF8" call can be turned
|
||||
* into an inline function.
|
||||
*/
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
/*
 * Converts the UTF-16 text in [*sourceStart, sourceEnd) to UTF-8, storing the
 * output in [*targetStart, targetEnd). Both start pointers are advanced to
 * reflect what was consumed and produced.
 *
 * Surrogate pairs are combined before encoding. With strictConversion an
 * unpaired surrogate ends the conversion with sourceIllegal; with any other
 * flag it is encoded as it stands.
 *
 * Returns conversionOK, sourceExhausted (input ends right after a high
 * surrogate), targetExhausted (the next character does not fit) or
 * sourceIllegal. In every failure case *sourceStart is left on the first
 * unit of the character that was not converted.
 */
ConversionResult ConvertUTF16toUTF8 (
    const UTF16** sourceStart, const UTF16* sourceEnd,
    UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    const UTF32 continuationMask = 0xBF;
    const UTF32 continuationMark = 0x80;
    ConversionResult status = conversionOK;
    const UTF16* in = *sourceStart;
    UTF8* out = *targetStart;

    while (in < sourceEnd) {
        const UTF16* const unitStart = in; /* where to rewind to on any failure */
        UTF32 cp = *in++;
        UTF32 trail;
        unsigned short nbytes;
        unsigned short idx;

        if (cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_HIGH_END) {
            if (in >= sourceEnd) {
                /* the unit that should follow the high surrogate is missing */
                in = unitStart;
                status = sourceExhausted;
                break;
            }
            trail = *in;
            if (trail >= UNI_SUR_LOW_START && trail <= UNI_SUR_LOW_END) {
                cp = ((cp - UNI_SUR_HIGH_START) << halfShift)
                    + (trail - UNI_SUR_LOW_START) + halfBase;
                ++in;
            } else if (flags == strictConversion) {
                /* unpaired high surrogate */
                in = unitStart;
                status = sourceIllegal;
                break;
            }
        } else if (flags == strictConversion
            && cp >= UNI_SUR_LOW_START && cp <= UNI_SUR_LOW_END) {
            /* low surrogate without a preceding high surrogate */
            in = unitStart;
            status = sourceIllegal;
            break;
        }

        /* Work out the length of the encoded form. */
        if (cp < (UTF32)0x80) {
            nbytes = 1;
        } else if (cp < (UTF32)0x800) {
            nbytes = 2;
        } else if (cp < (UTF32)0x10000) {
            nbytes = 3;
        } else if (cp < (UTF32)0x110000) {
            nbytes = 4;
        } else {
            nbytes = 3;
            cp = UNI_REPLACEMENT_CHAR;
        }

        if (out + nbytes > targetEnd) {
            in = unitStart;
            status = targetExhausted;
            break;
        }

        /* Fill the continuation bytes from the last one backwards, then the lead byte. */
        for (idx = nbytes; idx > 1; --idx) {
            out[idx - 1] = (UTF8)((cp | continuationMark) & continuationMask);
            cp >>= 6;
        }
        out[0] = (UTF8)(cp | firstByteMark[nbytes]);
        out += nbytes;
    }

    *sourceStart = in;
    *targetStart = out;
    return status;
}
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* Utility routine to tell whether a sequence of bytes is legal UTF-8.
|
||||
* This must be called with the length pre-determined by the first byte.
|
||||
* If not calling this from ConvertUTF8to*, then the length can be set by:
|
||||
* length = trailingBytesForUTF8[*source]+1;
|
||||
* and the sequence is illegal right away if there aren't that many bytes
|
||||
* available.
|
||||
* If presented with a length > 4, this returns false. The Unicode
|
||||
* definition of UTF-8 goes up to 4-byte sequences.
|
||||
*/
|
||||
|
||||
static Boolean isLegalUTF8 (const UTF8 *source, int length) {
|
||||
UTF8 a;
|
||||
const UTF8 *srcptr = source+length;
|
||||
switch (length) {
|
||||
default: return false;
|
||||
/* Everything else falls through when "true"... */
|
||||
case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
|
||||
case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
|
||||
case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
|
||||
|
||||
switch (*source) {
|
||||
/* no fall-through in this inner switch */
|
||||
case 0xE0: if (a < 0xA0) return false; break;
|
||||
case 0xED: if (a > 0x9F) return false; break;
|
||||
case 0xF0: if (a < 0x90) return false; break;
|
||||
case 0xF4: if (a > 0x8F) return false; break;
|
||||
default: if (a < 0x80) return false;
|
||||
}
|
||||
|
||||
case 1: if (*source >= 0x80 && *source < 0xC2) return false;
|
||||
}
|
||||
if (*source > 0xF4) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
/*
|
||||
* Exported function to return whether a UTF-8 sequence is legal or not.
|
||||
* This is not used here; it's just exported.
|
||||
*/
|
||||
|
||||
Boolean isLegalUTF8Sequence (const UTF8 *source, const UTF8 *sourceEnd) {
|
||||
int length;
|
||||
if (source == sourceEnd) {
|
||||
return true;
|
||||
}
|
||||
while (true) {
|
||||
length = trailingBytesForUTF8[*source]+1;
|
||||
if (source+length > sourceEnd) {
|
||||
return false;
|
||||
}
|
||||
if (!isLegalUTF8 (source, length)) {
|
||||
return false;
|
||||
}
|
||||
source += length;
|
||||
if (source >= sourceEnd) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This is a variation of isLegalUTF8Sequence () that behaves like g_utf8_validate ().
|
||||
* In addition to knowing if the sequence is legal, it also tells you the last good character.
|
||||
*/
|
||||
Boolean
|
||||
tr_utf8_validate (const char * str, size_t max_len, const char ** end)
|
||||
{
|
||||
const UTF8* source = (const UTF8*) str;
|
||||
const UTF8* sourceEnd;
|
||||
|
||||
if (max_len == 0)
|
||||
return true;
|
||||
|
||||
if (str == NULL)
|
||||
return false;
|
||||
|
||||
sourceEnd = source + (max_len == ((size_t)-1) ? strlen (str) : max_len);
|
||||
|
||||
if (source == sourceEnd)
|
||||
{
|
||||
if (end != NULL)
|
||||
*end = (const char*) source;
|
||||
return true;
|
||||
}
|
||||
|
||||
for (;;)
|
||||
{
|
||||
const int length = trailingBytesForUTF8[*source] + 1;
|
||||
if (source + length > sourceEnd) {
|
||||
if (end != NULL)
|
||||
*end = (const char*) source;
|
||||
return false;
|
||||
}
|
||||
if (!isLegalUTF8 (source, length)) {
|
||||
if (end != NULL)
|
||||
*end = (const char*) source;
|
||||
return false;
|
||||
}
|
||||
source += length;
|
||||
if (source >= sourceEnd) {
|
||||
if (end != NULL)
|
||||
*end = (const char*) source;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
/*
 * Converts the UTF-8 text in [*sourceStart, sourceEnd) to UTF-16, storing the
 * output in [*targetStart, targetEnd). Both start pointers are advanced to
 * reflect what was consumed and produced.
 *
 * Every sequence is checked with isLegalUTF8 () regardless of `flags`.
 *
 * Returns conversionOK, sourceExhausted (the last sequence is cut short),
 * targetExhausted (the next character does not fit) or sourceIllegal. In
 * every failure case *sourceStart is left on the first byte of the sequence
 * that was not converted.
 */
ConversionResult ConvertUTF8toUTF16 (
    const UTF8** sourceStart, const UTF8* sourceEnd,
    UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
    ConversionResult status = conversionOK;
    const UTF8* in = *sourceStart;
    UTF16* out = *targetStart;

    while (in < sourceEnd) {
        const UTF8* const seqStart = in; /* where to rewind to on failure */
        const unsigned short trailing = trailingBytesForUTF8[*in];
        unsigned short k;
        UTF32 cp = 0;

        if (trailing >= sourceEnd - in) {
            status = sourceExhausted;
            break;
        }
        /* validated whether lenient or strict */
        if (!isLegalUTF8 (in, trailing + 1)) {
            status = sourceIllegal;
            break;
        }

        /* Accumulate the raw bytes, then strip the marker bits in one subtraction. */
        for (k = 0; k < trailing; ++k) {
            cp += *in++;
            cp <<= 6;
        }
        cp += *in++;
        cp -= offsetsFromUTF8[trailing];

        if (out >= targetEnd) {
            in = seqStart;
            status = targetExhausted;
            break;
        }

        if (cp > UNI_MAX_UTF16) {
            if (flags == strictConversion) {
                status = sourceIllegal;
                in = seqStart;
                break;
            }
            *out++ = UNI_REPLACEMENT_CHAR;
        } else if (cp > UNI_MAX_BMP) {
            /* 0x10000 - 0x10FFFF: emit a surrogate pair, which needs two units */
            if (out + 1 >= targetEnd) {
                in = seqStart;
                status = targetExhausted;
                break;
            }
            cp -= halfBase;
            *out++ = (UTF16)((cp >> halfShift) + UNI_SUR_HIGH_START);
            *out++ = (UTF16)((cp & halfMask) + UNI_SUR_LOW_START);
        } else if (cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_LOW_END) {
            /* a surrogate value is not a legal character */
            if (flags == strictConversion) {
                in = seqStart;
                status = sourceIllegal;
                break;
            }
            *out++ = UNI_REPLACEMENT_CHAR;
        } else {
            *out++ = (UTF16)cp; /* ordinary BMP character */
        }
    }

    *sourceStart = in;
    *targetStart = out;
    return status;
}
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
/*
 * Converts the UTF-32 text in [*sourceStart, sourceEnd) to UTF-8, storing the
 * output in [*targetStart, targetEnd). Both start pointers are advanced to
 * reflect what was consumed and produced.
 *
 * With strictConversion a surrogate value ends the conversion with
 * sourceIllegal and *sourceStart left on it. A value above
 * UNI_MAX_LEGAL_UTF32 is written as UNI_REPLACEMENT_CHAR and marks the
 * result sourceIllegal, but the loop keeps going. When the next character
 * does not fit, the function returns targetExhausted with *sourceStart left
 * on that character.
 */
ConversionResult ConvertUTF32toUTF8 (
    const UTF32** sourceStart, const UTF32* sourceEnd,
    UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
    const UTF32 continuationMask = 0xBF;
    const UTF32 continuationMark = 0x80;
    ConversionResult status = conversionOK;
    const UTF32* in = *sourceStart;
    UTF8* out = *targetStart;

    while (in < sourceEnd) {
        UTF32 cp = *in++;
        unsigned short nbytes;
        unsigned short idx;

        /* surrogate values are only rejected in strict mode */
        if (flags == strictConversion
            && cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_LOW_END) {
            --in; /* leave the pointer on the illegal value */
            status = sourceIllegal;
            break;
        }

        /* Work out the length of the encoded form. */
        if (cp < (UTF32)0x80) {
            nbytes = 1;
        } else if (cp < (UTF32)0x800) {
            nbytes = 2;
        } else if (cp < (UTF32)0x10000) {
            nbytes = 3;
        } else if (cp <= UNI_MAX_LEGAL_UTF32) {
            nbytes = 4;
        } else {
            /* out-of-range value: substitute and remember the error */
            nbytes = 3;
            cp = UNI_REPLACEMENT_CHAR;
            status = sourceIllegal;
        }

        if (out + nbytes > targetEnd) {
            --in; /* leave the pointer on the value that did not fit */
            status = targetExhausted;
            break;
        }

        /* Fill the continuation bytes from the last one backwards, then the lead byte. */
        for (idx = nbytes; idx > 1; --idx) {
            out[idx - 1] = (UTF8)((cp | continuationMark) & continuationMask);
            cp >>= 6;
        }
        out[0] = (UTF8)(cp | firstByteMark[nbytes]);
        out += nbytes;
    }

    *sourceStart = in;
    *targetStart = out;
    return status;
}
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
||||
|
||||
/*
 * Converts the UTF-8 text in [*sourceStart, sourceEnd) to UTF-32, storing the
 * output in [*targetStart, targetEnd). Both start pointers are advanced to
 * reflect what was consumed and produced.
 *
 * Every sequence is checked with isLegalUTF8 () regardless of `flags`.
 *
 * Returns conversionOK, sourceExhausted (the last sequence is cut short),
 * targetExhausted (no room for the next value; *sourceStart is left on the
 * sequence that did not fit) or sourceIllegal.
 */
ConversionResult ConvertUTF8toUTF32 (
    const UTF8** sourceStart, const UTF8* sourceEnd,
    UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
    ConversionResult status = conversionOK;
    const UTF8* in = *sourceStart;
    UTF32* out = *targetStart;

    while (in < sourceEnd) {
        const UTF8* const seqStart = in; /* where to rewind to on failure */
        const unsigned short trailing = trailingBytesForUTF8[*in];
        unsigned short k;
        UTF32 cp = 0;

        if (trailing >= sourceEnd - in) {
            status = sourceExhausted;
            break;
        }
        /* validated whether lenient or strict */
        if (!isLegalUTF8 (in, trailing + 1)) {
            status = sourceIllegal;
            break;
        }

        /* Accumulate the raw bytes, then strip the marker bits in one subtraction. */
        for (k = 0; k < trailing; ++k) {
            cp += *in++;
            cp <<= 6;
        }
        cp += *in++;
        cp -= offsetsFromUTF8[trailing];

        if (out >= targetEnd) {
            in = seqStart;
            status = targetExhausted;
            break;
        }

        if (cp > UNI_MAX_LEGAL_UTF32) {
            /* beyond 0x10FFFF: substitute and remember the error */
            status = sourceIllegal;
            *out++ = UNI_REPLACEMENT_CHAR;
        } else if (cp >= UNI_SUR_HIGH_START && cp <= UNI_SUR_LOW_END) {
            /* a surrogate value is not a legal character */
            if (flags == strictConversion) {
                in = seqStart;
                status = sourceIllegal;
                break;
            }
            *out++ = UNI_REPLACEMENT_CHAR;
        } else {
            *out++ = cp;
        }
    }

    *sourceStart = in;
    *targetStart = out;
    return status;
}
|
||||
|
||||
/* ---------------------------------------------------------------------
|
||||
|
||||
Note A.
|
||||
The fall-through switches in UTF-8 reading code save a
|
||||
temp variable, some decrements & conditionals. The switches
|
||||
are equivalent to the following loop:
|
||||
{
|
||||
int tmpBytesToRead = extraBytesToRead+1;
|
||||
do {
|
||||
ch += *source++;
|
||||
--tmpBytesToRead;
|
||||
if (tmpBytesToRead) ch <<= 6;
|
||||
} while (tmpBytesToRead > 0);
|
||||
}
|
||||
In UTF-8 writing code, the switches on "bytesToWrite" are
|
||||
similarly unrolled loops.
|
||||
|
||||
--------------------------------------------------------------------- */
|
|
@ -1,160 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#ifndef __TRANSMISSION__
|
||||
#error only libtransmission should #include this header.
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Copyright 2001-2004 Unicode, Inc.
|
||||
*
|
||||
* Disclaimer
|
||||
*
|
||||
* This source code is provided as is by Unicode, Inc. No claims are
|
||||
* made as to fitness for any particular purpose. No warranties of any
|
||||
* kind are expressed or implied. The recipient agrees to determine
|
||||
* applicability of information provided. If this file has been
|
||||
* purchased on magnetic or optical media from Unicode, Inc., the
|
||||
* sole remedy for any claim will be exchange of defective media
|
||||
* within 90 days of receipt.
|
||||
*
|
||||
* Limitations on Rights to Redistribute This Code
|
||||
*
|
||||
* Unicode, Inc. hereby grants the right to freely use the information
|
||||
* supplied in this file in the creation of products supporting the
|
||||
* Unicode Standard, and to make copies of this file in any form
|
||||
* for internal or external distribution as long as this notice
|
||||
* remains attached.
|
||||
*/
|
||||
|
||||
/* ---------------------------------------------------------------------
|
||||
|
||||
Conversions between UTF32, UTF-16, and UTF-8. Header file.
|
||||
|
||||
Several functions are included here, forming a complete set of
|
||||
conversions between the three formats. UTF-7 is not included
|
||||
here, but is handled in a separate source file.
|
||||
|
||||
Each of these routines takes pointers to input buffers and output
|
||||
buffers. The input buffers are const.
|
||||
|
||||
Each routine converts the text between *sourceStart and sourceEnd,
|
||||
putting the result into the buffer between *targetStart and
|
||||
targetEnd. Note: the end pointers are *after* the last item: e.g.
|
||||
*(sourceEnd - 1) is the last item.
|
||||
|
||||
The return result indicates whether the conversion was successful,
|
||||
and if not, whether the problem was in the source or target buffers.
|
||||
(Only the first encountered problem is indicated.)
|
||||
|
||||
After the conversion, *sourceStart and *targetStart are both
|
||||
updated to point to the end of last text successfully converted in
|
||||
the respective buffers.
|
||||
|
||||
Input parameters:
|
||||
sourceStart - pointer to a pointer to the source buffer.
|
||||
The contents of this are modified on return so that
|
||||
it points at the next thing to be converted.
|
||||
targetStart - similarly, pointer to pointer to the target buffer.
|
||||
sourceEnd, targetEnd - respectively pointers to the ends of the
|
||||
two buffers, for overflow checking only.
|
||||
|
||||
These conversion functions take a ConversionFlags argument. When this
|
||||
flag is set to strict, both irregular sequences and isolated surrogates
|
||||
will cause an error. When the flag is set to lenient, both irregular
|
||||
sequences and isolated surrogates are converted.
|
||||
|
||||
Whether the flag is strict or lenient, all illegal sequences will cause
|
||||
an error return. This includes sequences such as: <F4 90 80 80>, <C0 80>,
|
||||
or <A0> in UTF-8, and values above 0x10FFFF in UTF-32. Conformant code
|
||||
must check for illegal sequences.
|
||||
|
||||
When the flag is set to lenient, characters over 0x10FFFF are converted
|
||||
to the replacement character; otherwise (when the flag is set to strict)
|
||||
they constitute an error.
|
||||
|
||||
Output parameters:
|
||||
The value "sourceIllegal" is returned from some routines if the input
|
||||
sequence is malformed. When "sourceIllegal" is returned, the source
|
||||
value will point to the illegal value that caused the problem. E.g.,
|
||||
in UTF-8 when a sequence is malformed, it points to the start of the
|
||||
malformed sequence.
|
||||
|
||||
Author: Mark E. Davis, 1994.
|
||||
Rev History: Rick McGowan, fixes & updates May 2001.
|
||||
Fixes & updates, Sept 2001.
|
||||
|
||||
------------------------------------------------------------------------ */
|
||||
|
||||
/* ---------------------------------------------------------------------
|
||||
The following 4 definitions are compiler-specific.
|
||||
The C standard does not guarantee that wchar_t has at least
|
||||
16 bits, so wchar_t is no less portable than unsigned short!
|
||||
All should be unsigned values to avoid sign extension during
|
||||
bit mask & shift operations.
|
||||
------------------------------------------------------------------------ */
|
||||
|
||||
typedef unsigned long UTF32; /* at least 32 bits */
|
||||
typedef unsigned short UTF16; /* at least 16 bits */
|
||||
typedef unsigned char UTF8; /* typically 8 bits */
|
||||
typedef unsigned char Boolean; /* 0 or 1 */
|
||||
|
||||
/* Some fundamental constants */
|
||||
#define UNI_REPLACEMENT_CHAR (UTF32)0x0000FFFD
|
||||
#define UNI_MAX_BMP (UTF32)0x0000FFFF
|
||||
#define UNI_MAX_UTF16 (UTF32)0x0010FFFF
|
||||
#define UNI_MAX_UTF32 (UTF32)0x7FFFFFFF
|
||||
#define UNI_MAX_LEGAL_UTF32 (UTF32)0x0010FFFF
|
||||
|
||||
typedef enum {
|
||||
conversionOK, /* conversion successful */
|
||||
sourceExhausted, /* partial character in source, but hit end */
|
||||
targetExhausted, /* insuff. room in target for conversion */
|
||||
sourceIllegal /* source sequence is illegal/malformed */
|
||||
} ConversionResult;
|
||||
|
||||
typedef enum {
|
||||
strictConversion = 0,
|
||||
lenientConversion
|
||||
} ConversionFlags;
|
||||
|
||||
/* This is for C++ and does no harm in C */
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
ConversionResult ConvertUTF8toUTF16 (
|
||||
const UTF8** sourceStart, const UTF8* sourceEnd,
|
||||
UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
|
||||
|
||||
ConversionResult ConvertUTF16toUTF8 (
|
||||
const UTF16** sourceStart, const UTF16* sourceEnd,
|
||||
UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
|
||||
|
||||
ConversionResult ConvertUTF8toUTF32 (
|
||||
const UTF8** sourceStart, const UTF8* sourceEnd,
|
||||
UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
|
||||
|
||||
ConversionResult ConvertUTF32toUTF8 (
|
||||
const UTF32** sourceStart, const UTF32* sourceEnd,
|
||||
UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags);
|
||||
|
||||
ConversionResult ConvertUTF16toUTF32 (
|
||||
const UTF16** sourceStart, const UTF16* sourceEnd,
|
||||
UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags);
|
||||
|
||||
ConversionResult ConvertUTF32toUTF16 (
|
||||
const UTF32** sourceStart, const UTF32* sourceEnd,
|
||||
UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags);
|
||||
|
||||
Boolean isLegalUTF8Sequence (const UTF8 *source, const UTF8 *sourceEnd);
|
||||
|
||||
|
||||
/* intended to work the same as g_utf8_validate */
|
||||
Boolean tr_utf8_validate (const char * str, size_t max_len, const char ** end);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
/* --------------------------------------------------------------------- */
|
|
@ -51,6 +51,7 @@
|
|||
#include <iconv.h>
|
||||
#endif
|
||||
|
||||
#include <utf8.h>
|
||||
#include <event2/buffer.h>
|
||||
#include <event2/event.h>
|
||||
|
||||
|
@ -58,7 +59,6 @@
|
|||
#include "error.h"
|
||||
#include "error-types.h"
|
||||
#include "file.h"
|
||||
#include "ConvertUTF.h"
|
||||
#include "log.h"
|
||||
#include "mime-types.h"
|
||||
#include "net.h"
|
||||
|
@ -756,29 +756,43 @@ void tr_removeElementFromArray(void* array, size_t index_to_remove, size_t sizeo
|
|||
****
|
||||
***/
|
||||
|
||||
static char* strip_non_utf8(char const* in, size_t inlen)
|
||||
bool tr_utf8_validate(std::string_view sv, char const** good_end)
|
||||
{
|
||||
evbuffer* const buf = evbuffer_new();
|
||||
auto const* begin = std::data(sv);
|
||||
auto const* const end = begin + std::size(sv);
|
||||
auto const* walk = begin;
|
||||
|
||||
char const* end = nullptr;
|
||||
while (!tr_utf8_validate(in, inlen, &end))
|
||||
try
|
||||
{
|
||||
while (walk < end)
|
||||
{
|
||||
utf8::next(walk, end);
|
||||
}
|
||||
}
|
||||
catch (utf8::exception&)
|
||||
{
|
||||
int const good_len = end - in;
|
||||
|
||||
evbuffer_add(buf, in, good_len);
|
||||
inlen -= (good_len + 1);
|
||||
in += (good_len + 1);
|
||||
evbuffer_add(buf, "?", 1);
|
||||
}
|
||||
|
||||
evbuffer_add(buf, in, inlen);
|
||||
return evbuffer_free_to_str(buf, nullptr);
|
||||
if (good_end != nullptr)
|
||||
{
|
||||
*good_end = walk;
|
||||
}
|
||||
|
||||
return walk == end;
|
||||
}
|
||||
|
||||
static char* to_utf8(char const* in, size_t inlen)
|
||||
static char* strip_non_utf8(std::string_view sv)
|
||||
{
|
||||
char* ret = tr_new(char, std::size(sv) + 1);
|
||||
auto const it = utf8::replace_invalid(std::data(sv), std::data(sv) + std::size(sv), ret, '?');
|
||||
*it = '\0';
|
||||
return ret;
|
||||
}
|
||||
|
||||
static char* to_utf8(std::string_view sv)
|
||||
{
|
||||
#ifdef HAVE_ICONV
|
||||
size_t const buflen = inlen * 4 + 10;
|
||||
size_t const buflen = std::size(sv) * 4 + 10;
|
||||
char* out = tr_new(char, buflen);
|
||||
|
||||
auto constexpr Encodings = std::array<char const*, 2>{ "CURRENT", "ISO-8859-15" };
|
||||
|
@ -791,12 +805,12 @@ static char* to_utf8(char const* in, size_t inlen)
|
|||
}
|
||||
|
||||
#ifdef ICONV_SECOND_ARGUMENT_IS_CONST
|
||||
auto const* inbuf = in;
|
||||
auto const* inbuf = std::data(sv);
|
||||
#else
|
||||
auto* inbuf = const_cast<char*>(in);
|
||||
auto* inbuf = const_cast<char*>(std::data(sv));
|
||||
#endif
|
||||
char* outbuf = out;
|
||||
size_t inbytesleft = inlen;
|
||||
size_t inbytesleft = std::size(sv);
|
||||
size_t outbytesleft = buflen;
|
||||
auto const rv = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
|
||||
iconv_close(cd);
|
||||
|
@ -812,30 +826,24 @@ static char* to_utf8(char const* in, size_t inlen)
|
|||
|
||||
#endif
|
||||
|
||||
return strip_non_utf8(in, inlen);
|
||||
return strip_non_utf8(sv);
|
||||
}
|
||||
|
||||
char* tr_utf8clean(std::string_view str)
|
||||
char* tr_utf8clean(std::string_view sv)
|
||||
{
|
||||
char* const ret = tr_utf8_validate(std::data(str), std::size(str), nullptr) ? tr_strndup(std::data(str), std::size(str)) :
|
||||
to_utf8(std::data(str), std::size(str));
|
||||
TR_ASSERT(tr_utf8_validate(ret, strlen(ret), nullptr));
|
||||
char* const ret = tr_utf8_validate(sv, nullptr) ? tr_strvDup(sv) : to_utf8(sv);
|
||||
TR_ASSERT(tr_utf8_validate(ret, nullptr));
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool tr_strvUtf8Validate(std::string_view sv)
|
||||
{
|
||||
return tr_utf8_validate(std::data(sv), std::size(sv), nullptr);
|
||||
}
|
||||
|
||||
std::string tr_strvUtf8Clean(std::string_view sv)
|
||||
{
|
||||
if (tr_strvUtf8Validate(sv))
|
||||
if (tr_utf8_validate(sv, nullptr))
|
||||
{
|
||||
return std::string{ sv };
|
||||
}
|
||||
|
||||
auto* const tmp = to_utf8(std::data(sv), std::size(sv));
|
||||
auto* const tmp = to_utf8(sv);
|
||||
auto ret = std::string{ tmp ? tmp : "" };
|
||||
tr_free(tmp);
|
||||
return ret;
|
||||
|
|
|
@ -132,6 +132,8 @@ void tr_wait_msec(long int delay_milliseconds);
|
|||
*/
|
||||
char* tr_utf8clean(std::string_view str) TR_GNUC_MALLOC;
|
||||
|
||||
bool tr_utf8_validate(std::string_view sv, char const** endptr);
|
||||
|
||||
#ifdef _WIN32
|
||||
|
||||
char* tr_win32_native_to_utf8(wchar_t const* text, int text_size);
|
||||
|
|
|
@ -14,18 +14,19 @@
|
|||
#include <cstdio>
|
||||
#include <cstring>
|
||||
|
||||
#include <utf8.h>
|
||||
#include <event2/buffer.h> /* evbuffer_add() */
|
||||
|
||||
#define LIBTRANSMISSION_VARIANT_MODULE
|
||||
|
||||
#include "transmission.h"
|
||||
#include "ConvertUTF.h"
|
||||
|
||||
#include "jsonsl.h"
|
||||
#include "log.h"
|
||||
#include "tr-assert.h"
|
||||
#include "utils.h"
|
||||
#include "variant.h"
|
||||
#include "variant-common.h"
|
||||
#include "variant.h"
|
||||
|
||||
using namespace std::literals;
|
||||
|
||||
|
@ -224,19 +225,17 @@ static std::string_view extract_escaped_string(char const* in, size_t in_len, st
|
|||
|
||||
if (decode_hex_string(in, &val))
|
||||
{
|
||||
UTF32 str32_buf[2] = { val, 0 };
|
||||
UTF32 const* str32_walk = str32_buf;
|
||||
UTF32 const* str32_end = str32_buf + 1;
|
||||
UTF8 str8_buf[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
|
||||
UTF8* str8_walk = str8_buf;
|
||||
UTF8* str8_end = str8_buf + 8;
|
||||
|
||||
if (ConvertUTF32toUTF8(&str32_walk, str32_end, &str8_walk, str8_end, {}) == 0)
|
||||
try
|
||||
{
|
||||
evbuffer_add(buf, str8_buf, str8_walk - str8_buf);
|
||||
unescaped = true;
|
||||
auto buf8 = std::array<char, 8>{};
|
||||
auto const it = utf8::append(val, std::data(buf8));
|
||||
evbuffer_add(buf, std::data(buf8), it - std::data(buf8));
|
||||
}
|
||||
|
||||
catch (utf8::exception&)
|
||||
{ // invalid codepoint
|
||||
evbuffer_add(buf, "?", 1);
|
||||
}
|
||||
unescaped = true;
|
||||
in += 6;
|
||||
break;
|
||||
}
|
||||
|
@ -580,20 +579,20 @@ static void jsonStringFunc(tr_variant const* val, void* vdata)
|
|||
}
|
||||
else
|
||||
{
|
||||
auto const* const begin = reinterpret_cast<UTF8 const*>(std::data(sv));
|
||||
auto const* tmp = begin;
|
||||
auto const* end = tmp + std::size(sv);
|
||||
UTF32 buf[1] = { 0 };
|
||||
UTF32* u32 = buf;
|
||||
ConversionResult result = ConvertUTF8toUTF32(&tmp, end, &u32, buf + 1, {});
|
||||
|
||||
if ((result == conversionOK || result == targetExhausted) && tmp != begin)
|
||||
try
|
||||
{
|
||||
outwalk += tr_snprintf(outwalk, outend - outwalk, "\\u%04x", (unsigned int)buf[0]);
|
||||
sv.remove_prefix(tmp - begin - 1);
|
||||
auto* begin8 = std::data(sv);
|
||||
auto* end8 = begin8 + std::size(sv);
|
||||
auto* walk8 = begin8;
|
||||
auto const uch32 = utf8::next(walk8, end8);
|
||||
outwalk += tr_snprintf(outwalk, outend - outwalk, "\\u%04x", uch32);
|
||||
sv.remove_prefix(walk8 - begin8 - 1);
|
||||
}
|
||||
catch (utf8::exception&)
|
||||
{
|
||||
*outwalk++ = '?';
|
||||
}
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,7 +16,6 @@
|
|||
|
||||
#include "transmission.h"
|
||||
|
||||
#include "ConvertUTF.h" // tr_utf8_validate()
|
||||
#include "crypto-utils.h" // tr_rand_int_weak()
|
||||
#include "platform.h"
|
||||
#include "ptrarray.h"
|
||||
|
@ -182,26 +181,26 @@ TEST_F(UtilsTest, trUtf8clean)
|
|||
in = "\x92\xE0\xE3\xA4\xAD\xAE \xA1\xEB\xE2\xEC \x81\xAE\xA3\xAE\xAC"sv;
|
||||
out = makeString(tr_utf8clean(in));
|
||||
EXPECT_TRUE(std::size(out) == 17 || std::size(out) == 33);
|
||||
EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr));
|
||||
EXPECT_TRUE(tr_utf8_validate(out, nullptr));
|
||||
|
||||
// same string, but utf-8 clean
|
||||
in = "Трудно быть Богом"sv;
|
||||
out = makeString(tr_utf8clean(in));
|
||||
EXPECT_NE(nullptr, out.data());
|
||||
EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr));
|
||||
EXPECT_TRUE(tr_utf8_validate(out, nullptr));
|
||||
EXPECT_EQ(in, out);
|
||||
|
||||
in = "\xF4\x00\x81\x82"sv;
|
||||
out = makeString(tr_utf8clean(in));
|
||||
EXPECT_NE(nullptr, out.data());
|
||||
EXPECT_TRUE(out.size() == 1 || out.size() == 2);
|
||||
EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr));
|
||||
EXPECT_TRUE(tr_utf8_validate(out, nullptr));
|
||||
|
||||
in = "\xF4\x33\x81\x82"sv;
|
||||
out = makeString(tr_utf8clean(in));
|
||||
EXPECT_NE(nullptr, out.data());
|
||||
EXPECT_TRUE(out.size() == 4 || out.size() == 7);
|
||||
EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr));
|
||||
EXPECT_TRUE(tr_utf8_validate(out, nullptr));
|
||||
}
|
||||
|
||||
TEST_F(UtilsTest, trStrvUtf8Clean)
|
||||
|
@ -218,13 +217,13 @@ TEST_F(UtilsTest, trStrvUtf8Clean)
|
|||
in = "\x92\xE0\xE3\xA4\xAD\xAE \xA1\xEB\xE2\xEC \x81\xAE\xA3\xAE\xAC"sv;
|
||||
out = tr_strvUtf8Clean(in);
|
||||
EXPECT_TRUE(std::size(out) == 17 || std::size(out) == 33);
|
||||
EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr));
|
||||
EXPECT_TRUE(tr_utf8_validate(out, nullptr));
|
||||
|
||||
// same string, but utf-8 clean
|
||||
in = "Трудно быть Богом"sv;
|
||||
out = tr_strvUtf8Clean(in);
|
||||
EXPECT_NE(nullptr, out.data());
|
||||
EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr));
|
||||
EXPECT_TRUE(tr_utf8_validate(out, nullptr));
|
||||
EXPECT_EQ(in, out);
|
||||
|
||||
// https://trac.transmissionbt.com/ticket/6064
|
||||
|
@ -236,13 +235,13 @@ TEST_F(UtilsTest, trStrvUtf8Clean)
|
|||
out = tr_strvUtf8Clean(in);
|
||||
EXPECT_NE(nullptr, out.data());
|
||||
EXPECT_TRUE(out.size() == 1 || out.size() == 2);
|
||||
EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr));
|
||||
EXPECT_TRUE(tr_utf8_validate(out, nullptr));
|
||||
|
||||
in = "\xF4\x33\x81\x82"sv;
|
||||
out = tr_strvUtf8Clean(in);
|
||||
EXPECT_NE(nullptr, out.data());
|
||||
EXPECT_TRUE(out.size() == 4 || out.size() == 7);
|
||||
EXPECT_TRUE(tr_utf8_validate(out.c_str(), out.size(), nullptr));
|
||||
EXPECT_TRUE(tr_utf8_validate(out, nullptr));
|
||||
}
|
||||
|
||||
TEST_F(UtilsTest, trParseNumberRange)
|
||||
|
|
1
third-party/utfcpp
vendored
Submodule
1
third-party/utfcpp
vendored
Submodule
|
@ -0,0 +1 @@
|
|||
Subproject commit b85efd66a76caccbe0c186b00cab34df1e4281fa
|
Loading…
Reference in a new issue